engtagger 0.1.1 → 0.2.2

data/lib/engtagger.rb CHANGED
@@ -1,729 +1,831 @@
- #! /local/ruby/bin/ruby
-
- $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
- require 'rubygems'
- require 'kconv'
- require 'porter'
- # use hpricot for extracting English text from docs with XML like tags
- begin
-   require 'hpricot'
- rescue LoadError
-   $no_hpricot = true
- end
-
- # File paths
- $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
- $word_path = File.join($lexpath, "pos_words.hash")
- $tag_path = File.join($lexpath, "pos_tags.hash")
-
- # for memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
- class Module
-   def memoize(method)
-     # alias_method is faster than define_method + old.bind(self).call
-     alias_method "__memoized__#{method}", method
-     module_eval <<-EOF
-       def #{method}(*a, &b)
-         # assumes the block won't change the result if the args are the same
-         (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
-       end
-     EOF
-   end
- end
-
- # English part-of-speech tagger class
- class EngTagger
-   VERSION = '0.1.1'
-
-   #################
-   # Class methods #
-   #################
-
-   # Return a class variable that holds probability data
-   def self.hmm
-     return @@hmm
-   end
-
-   # Return a class variable that holds lexical data
-   def self.lexicon
-     return @@lexicon
-   end
-
-   # Return a regexp from a string argument that matches an XML-style pos tag
-   def self.get_ext(tag = nil)
-     return nil unless tag
-     return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
-   end
-
-   # Regexps to match XML-style part-of-speech tags
-   NUM = get_ext('cd')
-   GER = get_ext('vbg')
-   ADJ = get_ext('jj[rs]*')
-   PART = get_ext('vbn')
-   NN = get_ext('nn[sp]*')
-   NNP = get_ext('nnp')
-   PREP = get_ext('in')
-   DET = get_ext('det')
-   PAREN = get_ext('[lr]rb')
-   QUOT = get_ext('ppr')
-   SEN = get_ext('pp')
-   WORD = get_ext('\w+')
-
-   # Convert a Treebank-style, abbreviated tag into verbose definitions
-   def self.explain_tag(tag)
-     if TAGS[tag]
-       return TAGS[tag]
-     else
-       return tag
-     end
-   end
-
-   # The following is to make a hash to convert a pos tag to its definition,
-   # used by the explain_tag method
-   tags = [
-     "CC", "Conjunction, coordinating",
-     "CD", "Adjective, cardinal number",
-     "DET", "Determiner",
-     "EX", "Pronoun, existential there",
-     "FW", "Foreign words",
-     "IN", "Preposition / Conjunction",
-     "JJ", "Adjective",
-     "JJR", "Adjective, comparative",
-     "JJS", "Adjective, superlative",
-     "LS", "Symbol, list item",
-     "MD", "Verb, modal",
-     "NN", "Noun",
-     "NNP", "Noun, proper",
-     "NNPS", "Noun, proper, plural",
-     "NNS", "Noun, plural",
-     "PDT", "Determiner, prequalifier",
-     "POS", "Possessive",
-     "PRP", "Determiner, possessive second",
-     "PRPS", "Determiner, possessive",
-     "RB", "Adverb",
-     "RBR", "Adverb, comparative",
-     "RBS", "Adverb, superlative",
-     "RP", "Adverb, particle",
-     "SYM", "Symbol",
-     "TO", "Preposition",
-     "UH", "Interjection",
-     "VB", "Verb, infinitive",
-     "VBD", "Verb, past tense",
-     "VBG", "Verb, gerund",
-     "VBN", "Verb, past/passive participle",
-     "VBP", "Verb, base present form",
-     "VBZ", "Verb, present 3SG -s form",
-     "WDT", "Determiner, question",
-     "WP", "Pronoun, question",
-     "WPS", "Determiner, possessive & question",
-     "WRB", "Adverb, question",
-     "PP", "Punctuation, sentence ender",
-     "PPC", "Punctuation, comma",
-     "PPD", "Punctuation, dollar sign",
-     "PPL", "Punctuation, quotation mark left",
-     "PPR", "Punctuation, quotation mark right",
-     "PPS", "Punctuation, colon, semicolon, ellipsis",
-     "LRB", "Punctuation, left bracket",
-     "RRB", "Punctuation, right bracket"
-   ]
-   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
-   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
-   TAGS = Hash[*tags]
-
-   # Hash storing config values:
-   #
-   # * :unknown_word_tag
-   #   => (String) Tag to assign to unknown words
-   # * :stem
-   #   => (Boolean) Stem single words using Porter module
-   # * :weight_noun_phrases
-   #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
-   #      the value by the number of words in the NP.
-   # * :longest_noun_phrase
-   #   => (Integer) Will ignore noun phrases longer than this threshold. This
-   #      affects only the get_words() and get_nouns() methods.
-   # * :relax
-   #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
-   #      uncommon words, particularly words used polysemously
-   # * :tag_lex
-   #   => (String) Name of the YAML file containing a hash of adjacent part of
-   #      speech tags and the probability of each
-   # * :word_lex
-   #   => (String) Name of the YAML file containing a hash of words and corresponding
-   #      parts of speech
-   # * :unknown_lex
-   #   => (String) Name of the YAML file containing a hash of tags for unknown
-   #      words and corresponding parts of speech
-   # * :tag_path
-   #   => (String) Directory path of tag_lex
-   # * :word_path
-   #   => (String) Directory path of word_lex and unknown_lex
-   # * :debug
-   #   => (Boolean) Print debug messages
-   attr_accessor :conf
-
-   ###############
-   # Constructor #
-   ###############
-
-   # Take a hash of parameters that override default values.
-   # See above for details.
-   def initialize(params = {})
-     @conf = Hash.new
-     @conf[:unknown_word_tag] = ''
-     @conf[:stem] = false
-     @conf[:weight_noun_phrases] = false
-     @conf[:longest_noun_phrase] = 5
-     @conf[:relax] = false
-     @conf[:tag_lex] = 'tags.yml'
-     @conf[:word_lex] = 'words.yml'
-     @conf[:unknown_lex] = 'unknown.yml'
-     @conf[:word_path] = $word_path
-     @conf[:tag_path] = $tag_path
-     @conf[:debug] = false
-     # assuming that we start analyzing from the beginning of a new sentence...
-     @conf[:current_tag] = 'pp'
-     @conf.merge(params) if params
-     unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
-       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-       @@hmm = Hash.new
-       @@lexicon = Hash.new
-     else
-       lexf = File.open(@conf[:word_path], 'r')
-       @@lexicon = Marshal.load(lexf)
-       lexf.close
-       hmmf = File.open(@conf[:tag_path], 'r')
-       @@hmm = Marshal.load(hmmf)
-       hmmf.close
-     end
-     @@mnp = get_max_noun_regex
-   end
-
-   ##################
-   # Public methods #
-   ##################
-
-   # Examine the string provided and return it fully tagged in XML style
-   def add_tags(text, verbose = false)
-     return nil unless valid_text(text)
-     tagged = []
-     words = clean_text(text)
-     tags = Array.new
-     words.each do |word|
-       cleaned_word = clean_word(word)
-       tag = assign_tag(@conf[:current_tag], cleaned_word)
-       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
-       tag = EngTagger.explain_tag(tag) if verbose
-       tagged << '<' + tag + '>' + word + '</' + tag + '>'
-     end
-     reset
-     return tagged.join(' ')
-   end
-
-   # Given a text string, return as many nouns and noun phrases as possible.
-   # Applies add_tags and involves three stages:
-   #
-   # * Tag the text
-   # * Extract all the maximal noun phrases
-   # * Recursively extract all noun phrases from the MNPs
-   #
-   def get_words(text)
-     return false unless valid_text(text)
-     tagged = add_tags(text)
-     if(@conf[:longest_noun_phrase] <= 1)
-       return get_nouns(tagged)
-     else
-       return get_noun_phrases(tagged)
-     end
-   end
-
-   # Return an easy-on-the-eyes tagged version of a text string.
-   # Applies add_tags and reformats to be easier to read.
-   def get_readable(text, verbose = false)
-     return nil unless valid_text(text)
-     tagged = add_tags(text, verbose)
-     tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
-       $1 + '/' + $2.upcase
-     end
-     return tagged
-   end
-
-   # Return an array of sentences (without POS tags) from a text.
-   def get_sentences(text)
-     return nil unless valid_text(text)
-     tagged = add_tags(text)
-     sentences = Array.new
-     tagged.split(/<\/pp>/).each do |line|
-       sentences << strip_tags(line)
-     end
-     sentences = sentences.map do |sentence|
-       sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
-       sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
-       sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
-       sentence.gsub(Regexp.new(" (\W+)$")){$1}
-       sentence.gsub(Regexp.new("^(`+) ")){$1}
-     end
-     return sentences
-   end
-
-   # Given a POS-tagged text, this method returns a hash of all proper nouns
-   # and their occurrence frequencies. The method is greedy and will
-   # return multi-word phrases, if possible, so it would find ``Linguistic
-   # Data Consortium'' as a single unit, rather than as three individual
-   # proper nouns. This method does not stem the found words.
-   def get_proper_nouns(tagged)
-     return nil unless valid_text(tagged)
-     trimmed = tagged.scan(NNP).map do |n|
-       strip_tags(n)
-     end
-     nnp = Hash.new(0)
-     trimmed.each do |n|
-       next unless n.length < 100 # sanity check on word length
-       nnp[n] += 1 unless n =~ /\A\s*\z/
-     end
-     # Now for some fancy resolution stuff...
-     nnp.keys.each do |key|
-       words = key.split(/\s/)
-       # Let's say this is an organization's name --
-       # (and it's got at least three words)
-       # is there a corresponding acronym in this hash?
-       if words.length > 2
-         # Make a (naive) acronym out of this name
-         acronym = words.map do |word|
-           /\A([a-z])[a-z]*\z/ =~ word
-           $1
-         end.join ''
-         # If that acronym has been seen,
-         # remove it and add the values to
-         # the full name
-         if nnp[acronym]
-           nnp[key] += nnp[acronym]
-           nnp.delete(acronym)
-         end
-       end
-     end
-     return nnp
-   end
-
-   # Given a POS-tagged text, this method returns all nouns and their
-   # occurrence frequencies.
-   def get_nouns(tagged)
-     return nil unless valid_text(tagged)
-     NN
-     trimmed = tagged.scan(NN).map do |n|
-       strip_tags(n)
-     end
-     ret = Hash.new(0)
-     trimmed.each do |n|
-       n = stem(n)
-       next unless n.length < 100 # sanity check on word length
-       ret[n] += 1 unless n =~ /\A\s*\z/
-     end
-     return ret
-   end
-
-   # Given a POS-tagged text, this method returns only the maximal noun phrases.
-   # May be called directly, but is also used by get_noun_phrases
-   def get_max_noun_phrases(tagged)
-     return unless valid_text(tagged)
-     mn_phrases = tagged.scan(@@mnp).map do |m|
-       strip_tags(m)
-     end
-     ret = Hash.new(0)
-     mn_phrases.each do |p|
-       p = stem(p) unless p =~ /\s/ # stem single words
-       ret[p] += 1 unless p =~ /\A\s*\z/
-     end
-     return ret
-   end
-
-   # Similar to get_words, but requires a POS-tagged text as an argument.
-   def get_noun_phrases(tagged)
-     return nil unless valid_text(tagged)
-     found = Hash.new(0)
-     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-     scanned = tagged.scan(@@mnp)
-     # Find MNPs in the text, one sentence at a time
-     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
-     mn_phrases = []
-     scanned.each do |m|
-       found[m] += 1 if phrase_ext =~ m
-       mn_phrases += m.split(phrase_ext)
-     end
-     mn_phrases.each do |mnp|
-       # Split the phrase into an array of words, and create a loop for each word,
-       # shortening the phrase by removing the word in the first position.
-       # Record the phrase and any single nouns that are found
-       words = mnp.split
-       words.length.times do |i|
-         found[words.join(' ')] += 1 if words.length > 1
-         w = words.shift
-         found[w] += 1 if w =~ /#{NN}/
-       end
-     end
-     ret = Hash.new(0)
-     found.keys.each do |f|
-       k = strip_tags(f)
-       v = found[f]
-       # We weight by the word count to favor long noun phrases
-       space_count = k.scan(/\s+/)
-       word_count = space_count.length + 1
-       # Throttle MNPs if necessary
-       next if word_count > @conf[:longest_noun_phrase]
-       k = stem(k) unless word_count > 1 # stem single words
-       multiplier = 1
-       multiplier = word_count if @conf[:weight_noun_phrases]
-       ret[k] += multiplier * v
-     end
-     return ret
-   end
-
-   # Reads some included corpus data and saves it in a stored hash on the
-   # local file system. This is called automatically if the tagger can't
-   # find the stored lexicon.
-   def install
-     puts "Creating part-of-speech lexicon" if @conf[:debug]
-     load_tags(@conf[:tag_lex])
-     load_words(@conf[:word_lex])
-     load_words(@conf[:unknown_lex])
-     File.open(@conf[:word_path], 'w') do |f|
-       Marshal.dump(@@lexicon, f)
-     end
-     File.open(@conf[:tag_path], 'w') do |f|
-       Marshal.dump(@@hmm, f)
-     end
-   end
-
-   ###################
-   # Private methods #
-   ###################
-
-   :private
-
-   # Downcase the first letter of word
-   def lcfirst(word)
-     word.split(//)[0].downcase + word.split(//)[1..-1].join
-   end
-
-   # Upcase the first letter of word
-   def ucfirst(word)
-     word.split(//)[0].upcase + word.split(//)[1..-1].join
-   end
-
-   # Return the word stem as given by Stemmable module. This can be
-   # turned off with the class parameter @conf[:stem] => false.
-   def stem(word)
-     return word unless @conf[:stem]
-     return word.stem
-   end
-
-   # This method will reset the preceding tag to a sentence ender (PP).
-   # This prepares the first word of a new sentence to be tagged correctly.
-   def reset
-     @conf[:current_tag] = 'pp'
-   end
-
-   # Check whether the text is a valid string
-   def valid_text(text)
-     if !text
-       # there's nothing to parse
-       "method call on uninitialized variable" if @conf[:debug]
-       return false
-     elsif /\A\s*\z/ =~ text
-       # text is an empty string, nothing to parse
-       return false
-     else
-       # $text is valid
-       return true
-     end
-   end
-
-   # Return a text string with the part-of-speech tags removed
-   def strip_tags(tagged, downcase = false)
-     return nil unless valid_text(tagged)
-     text = tagged.gsub(/<[^>]+>/m, "")
-     text = text.gsub(/\s+/m, " ")
-     text = text.gsub(/\A\s*/, "")
-     text = text.gsub(/\s*\z/, "")
-     if downcase
-       return text.downcase
-     else
-       return text
-     end
-   end
-
-   # Strip the provided text of HTML-style tags and separate off any punctuation
-   # in preparation for tagging
-   def clean_text(text)
-     return false unless valid_text(text)
-     text = text.toutf8
-     unless $no_hpricot
-       # Strip out any markup and convert entities to their proper form
-       cleaned_text = Hpricot(text).inner_text
-     else
-       cleaned_text = text
-     end
-     tokenized = []
-     # Tokenize the text (splitting on punctuation as you go)
-     cleaned_text.split(/\s+/).each do |line|
-       tokenized += split_punct(line)
-     end
-     words = split_sentences(tokenized)
-     return words
-   end
-
-   # This handles all of the trailing periods, keeping those that
-   # belong on abbreviations and removing those that seem to be
-   # at the end of sentences. This method makes some assumptions
-   # about the use of capitalization in the incoming text
-   def split_sentences(array)
-     tokenized = array
-     people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-                 supt det mssrs rev)
-     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
-     inst = %w(dept univ assn bros ph.d)
-     place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-                hwy hway la pde pd plz pl rd st tce)
-     comp = %w(mfg inc ltd co corp)
-     state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
-                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
-                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
-     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
-     misc = %w(vs etc no esp)
-     abbr = Hash.new
-     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
-       abbr[i] = true
-     end
-     words = Array.new
-     tokenized.each_with_index do |t, i|
-       if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
-         w = $1
-         # Don't separate the period off words that
-         # meet any of the following conditions:
-         #
-         # 1. It is defined in one of the lists above
-         # 2. It is only one letter long: Alfred E. Sloan
-         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-         unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
-           words << w
-           words << '.'
-           next
-         end
-       end
-       words << tokenized[i]
-     end
-     # If the final word ends in a period...
-     if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
-       words[-1] = $1
-       words.push '.'
-     end
-     return words
-   end
-
-   # Separate punctuation from words, where appropriate. This leaves trailing
-   # periods in place to be dealt with later. Called by the clean_text method.
-   def split_punct(text)
-     # If there's no punctuation, return immediately
-     return [text] if /\A\w+\z/ =~ text
-     # Sanity checks
-     text = text.gsub(/\W{10,}/o, " ")
-
-     # Put quotes into a standard format
-     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
-     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-     text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
-     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
-
-     # Handle all other punctuation
-     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
-     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
-     text = text.gsub(/:/o, " :") # Shift colons off
-     text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
-     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
-
-     # English-specific contractions
-     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
-     text = text.gsub(/n't\b/o, " n't") # Separate off n't
-     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
-     result = text.split(' ')
-     return result
-   end
-
-   # Given a preceding tag, assign a tag to a word. Called by the add_tags method.
-   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
-   def assign_tag(prev_tag, word)
-     if word == "-unknown-"
-       # classify unknown words accordingly
-       return @conf[:unknown_word_tag]
-     elsif word == "-sym-"
-       # If this is a symbol, tag it as a symbol
-       return "sym"
-     end
-     best_so_far = 0
-     w = @@lexicon[word]
-     t = @@hmm
-
-     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
-     # which is used in most POS taggers
-     best_tag = ""
-     t[prev_tag].keys.each do |tag|
-       # With @config[:relax] set, this method
-       # will also include any `open classes' of POS tags
-       pw = 0
-       if w[tag]
-         pw = w[tag]
-       elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
-         pw = 0
-       else
-         next
-       end
-
-       # Bayesian logic:
-       # P = P( tag | prev_tag ) * P( tag | word )
-       probability = t[prev_tag][tag] * (pw + 1)
-       # Set the tag with maximal probability
-       if probability > best_so_far
-         best_so_far = probability
-         best_tag = tag
-       end
-     end
-     return best_tag
-   end
-
-   # This method determines whether a word should be considered in its
-   # lower or upper case form. This is useful in considering proper nouns
-   # and words that begin sentences. Called by add_tags.
-   def clean_word(word)
-     lcf = lcfirst(word)
-     # seen this word as it appears (lower or upper case)
-     if @@lexicon[word]
-       return word
-     elsif @@lexicon[lcf]
-       # seen this word only as lower case
-       return lcf
-     else
-       # never seen this word. guess.
-       return classify_unknown_word(word)
-     end
-   end
-
-   # This changes any word not appearing in the lexicon to identifiable
-   # classes of words handled by a simple unknown word classification
-   # metric. Called by the clean_word method.
-   def classify_unknown_word(word)
-     if /[\(\{\[]/ =~ word # Left brackets
-       classified = "*LRB*"
-     elsif
-       /[\)\}\]]/ =~ word # Right brackets
-       classified = "*RRB*"
-     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
-       classified = "*NUM*"
-     elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
-       classified = "*NUM*"
-     elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
-       classified = "*ORD*"
-     elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
-       classified = "-abr-"
-     elsif /\w-\w/o =~ word # Hyphenated word
-       /-([^-]+)\z/ =~ word
-       h_suffix = $1
-       if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
-         # last part of this is defined as an adjective
-         classified = "-hyp-adj-"
-       else
-         # last part of this is not defined as an adjective
-         classified = "-hyp-"
-       end
-     elsif /\A\W+\z/o =~ word
-       classified = "-sym-" # Symbol
-     elsif word == ucfirst(word)
-       classified = "-cap-" # Capitalized word
-     elsif /ing\z/o =~ word
-       classified = "-ing-" # Ends in 'ing'
-     elsif /s\z/o =~ word
-       classified = "-s-" # Ends in 's'
-     elsif /tion\z/o =~ word
-       classified = "-tion-" # Ends in 'tion'
-     elsif /ly\z/o =~ word
-       classified = "-ly-" # Ends in 'ly'
-     elsif /ed\z/o =~ word
-       classified = "-ed-" # Ends in 'ed'
-     else
-       classified = "-unknown-" # Completely unknown
-     end
-     return classified
-   end
-
-   # This returns a compiled regexp for extracting maximal noun phrases
-   # from a POS-tagged text.
-   def get_max_noun_regex
-     regex = /
-       # optional number, gerund - adjective - participle
-       (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
-       # Followed by one or more nouns
-       (?:#{NN})+
-       (?:
-         # Optional preposition, determinant, cardinal
-         (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
-         # Optional gerund - adjective - participle
-         (?:#{GER}|#{ADJ}|#{PART})*
-         # one or more nouns
-         (?:#{NN})+
-       )*
-     /xo #/
-     return regex
-   end
-
-   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-   # YAML data parser. It will load a YAML document with a collection of key:
-   # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
-   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
-   def load_tags(lexicon)
-     path = File.join($lexpath, lexicon)
-     fh = File.open(path, 'r')
-     while line = fh.gets
-       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-       next unless $1 and $2
-       key, data = $1, $2
-       tags = Hash.new
-       items = data.split(/,\s+/)
-       pairs = {}
-       items.each do |i|
-         /([^:]+):\s*(.+)/ =~ i
-         pairs[$1] = $2.to_f
-       end
-       @@hmm[key] = pairs
-     end
-     fh.close
-   end
-
-   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-   # YAML data parser. It will load a YAML document with a collection of key:
-   # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
-   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
-   def load_words(lexicon)
-     path = File.join($lexpath, lexicon)
-     fh = File.open(path, 'r')
-     while line = fh.gets
-       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-       next unless $1 and $2
-       key, data = $1, $2
-       tags = Hash.new
-       items = data.split(/,\s+/)
-       pairs = {}
-       items.each do |i|
-         /([^:]+):\s*(.+)/ =~ i
-         pairs[$1] = $2.to_f
-       end
-       @@lexicon[key] = pairs
-     end
-     fh.close
-   end
-
-   # memoize the stem and assign_tag methods
-   memoize("stem")
-   memoize("assign_tag")
- end
-
+ #!/usr/bin/env ruby
+ # -*- coding: utf-8 -*-
+
+ $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
+ require 'rubygems'
+ require 'kconv'
+ require 'porter'
+
+ # use hpricot for extracting English text from docs with XML like tags
+ begin
+   require 'hpricot'
+ rescue LoadError
+   $no_hpricot = true
+ end
+
+ # File paths
+ $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
+ $word_path = File.join($lexpath, "pos_words.hash")
+ $tag_path = File.join($lexpath, "pos_tags.hash")
+
+ # for memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
+ class Module
+   def memoize(method)
+     # alias_method is faster than define_method + old.bind(self).call
+     alias_method "__memoized__#{method}", method
+     module_eval <<-EOF
+       def #{method}(*a, &b)
+         # assumes the block won't change the result if the args are the same
+         (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
+       end
+     EOF
+   end
+ end
+
+ # English part-of-speech tagger class
+ class EngTagger
+
+   #################
+   # Class methods #
+   #################
+
+   # Return a class variable that holds probability data
+   def self.hmm
+     return @@hmm
+   end
+
+   # Return a class variable that holds lexical data
+   def self.lexicon
+     return @@lexicon
+   end
+
+   # Return a regexp from a string argument that matches an XML-style pos tag
+   def self.get_ext(tag = nil)
+     return nil unless tag
+     return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+   end
+
+   # Regexps to match XML-style part-of-speech tags
+   NUM = get_ext('cd')
+   GER = get_ext('vbg')
+   ADJ = get_ext('jj[rs]*')
+   NN = get_ext('nn[sp]*')
+   NNP = get_ext('nnp')
+   PREP = get_ext('in')
+   DET = get_ext('det')
+   PAREN = get_ext('[lr]rb')
+   QUOT = get_ext('ppr')
+   SEN = get_ext('pp')
+   WORD = get_ext('\w+')
+   VB = get_ext('vb')
+   VBG = get_ext('vbg')
+   VBD = get_ext('vbd')
+   PART = get_ext('vbn')
+   VBP = get_ext('vbp')
+   VBZ = get_ext('vbz')
+   JJ = get_ext('jj')
+   JJR = get_ext('jjr')
+   JJS = get_ext('jjs')
+   RB = get_ext('rb')
+   RBR = get_ext('rbr')
+   RBS = get_ext('rbs')
+   RP = get_ext('rp')
+   WRB = get_ext('wrb')
+   WDT = get_ext('wdt')
+   WP = get_ext('wp')
+   WPS = get_ext('wps')
+   CC = get_ext('cc')
+   IN = get_ext('in')
+
+   # Convert a Treebank-style, abbreviated tag into verbose definitions
+   def self.explain_tag(tag)
+     if TAGS[tag]
+       return TAGS[tag]
+     else
+       return tag
+     end
+   end
+
+   # The following is to make a hash to convert a pos tag to its definition,
+   # used by the explain_tag method
+   tags = [
+     "CC", "Conjunction, coordinating",
+     "CD", "Adjective, cardinal number",
+     "DET", "Determiner",
+     "EX", "Pronoun, existential there",
+     "FW", "Foreign words",
+     "IN", "Preposition / Conjunction",
+     "JJ", "Adjective",
+     "JJR", "Adjective, comparative",
+     "JJS", "Adjective, superlative",
+     "LS", "Symbol, list item",
+     "MD", "Verb, modal",
+     "NN", "Noun",
+     "NNP", "Noun, proper",
+     "NNPS", "Noun, proper, plural",
+     "NNS", "Noun, plural",
+     "PDT", "Determiner, prequalifier",
+     "POS", "Possessive",
+     "PRP", "Determiner, possessive second",
+     "PRPS", "Determiner, possessive",
+     "RB", "Adverb",
+     "RBR", "Adverb, comparative",
+     "RBS", "Adverb, superlative",
+     "RP", "Adverb, particle",
+     "SYM", "Symbol",
+     "TO", "Preposition",
+     "UH", "Interjection",
+     "VB", "Verb, infinitive",
+     "VBD", "Verb, past tense",
+     "VBG", "Verb, gerund",
+     "VBN", "Verb, past/passive participle",
+     "VBP", "Verb, base present form",
+     "VBZ", "Verb, present 3SG -s form",
+     "WDT", "Determiner, question",
+     "WP", "Pronoun, question",
+     "WPS", "Determiner, possessive & question",
+     "WRB", "Adverb, question",
+     "PP", "Punctuation, sentence ender",
+     "PPC", "Punctuation, comma",
+     "PPD", "Punctuation, dollar sign",
+     "PPL", "Punctuation, quotation mark left",
+     "PPR", "Punctuation, quotation mark right",
+     "PPS", "Punctuation, colon, semicolon, ellipsis",
+     "LRB", "Punctuation, left bracket",
+     "RRB", "Punctuation, right bracket"
+   ]
+   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
+   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+   TAGS = Hash[*tags]
+
+   # Hash storing config values:
+   #
+   # * :unknown_word_tag
+   #   => (String) Tag to assign to unknown words
+   # * :stem
+   #   => (Boolean) Stem single words using Porter module
+   # * :weight_noun_phrases
+   #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
+   #      the value by the number of words in the NP.
+   # * :longest_noun_phrase
+   #   => (Integer) Will ignore noun phrases longer than this threshold. This
+   #      affects only the get_words() and get_nouns() methods.
+   # * :relax
+   #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+   #      uncommon words, particularly words used polysemously
+   # * :tag_lex
+   #   => (String) Name of the YAML file containing a hash of adjacent part of
+   #      speech tags and the probability of each
+   # * :word_lex
+   #   => (String) Name of the YAML file containing a hash of words and corresponding
+   #      parts of speech
+   # * :unknown_lex
+   #   => (String) Name of the YAML file containing a hash of tags for unknown
+   #      words and corresponding parts of speech
+   # * :tag_path
+   #   => (String) Directory path of tag_lex
+   # * :word_path
+   #   => (String) Directory path of word_lex and unknown_lex
+   # * :debug
+   #   => (Boolean) Print debug messages
+   attr_accessor :conf
+
+   ###############
+   # Constructor #
+   ###############
+
+   # Take a hash of parameters that override default values.
+   # See above for details.
+   def initialize(params = {})
+     @conf = Hash.new
+     @conf[:unknown_word_tag] = ''
+     @conf[:stem] = false
+     @conf[:weight_noun_phrases] = false
+     @conf[:longest_noun_phrase] = 5
+     @conf[:relax] = false
+     @conf[:tag_lex] = 'tags.yml'
+     @conf[:word_lex] = 'words.yml'
+     @conf[:unknown_lex] = 'unknown.yml'
+     @conf[:word_path] = $word_path
+     @conf[:tag_path] = $tag_path
+     @conf[:debug] = false
+     # assuming that we start analyzing from the beginning of a new sentence...
+     @conf[:current_tag] = 'pp'
+     @conf.merge!(params)
+     unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
+       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
+       @@hmm = Hash.new
+       @@lexicon = Hash.new
+     else
+       lexf = File.open(@conf[:word_path], 'r')
+       @@lexicon = Marshal.load(lexf)
+       lexf.close
+       hmmf = File.open(@conf[:tag_path], 'r')
+       @@hmm = Marshal.load(hmmf)
+       hmmf.close
+     end
+     @@mnp = get_max_noun_regex
+   end
+
+   ##################
+   # Public methods #
+   ##################
+
+   # Examine the string provided and return it fully tagged in XML style
+   def add_tags(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = []
+     words = clean_text(text)
+     tags = Array.new
+     words.each do |word|
+       cleaned_word = clean_word(word)
+       tag = assign_tag(@conf[:current_tag], cleaned_word)
+       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+       tag = EngTagger.explain_tag(tag) if verbose
+       tagged << '<' + tag + '>' + word + '</' + tag + '>'
+     end
+     reset
+     return tagged.join(' ')
+   end
+
+   # Given a text string, return as many nouns and noun phrases as possible.
+   # Applies add_tags and involves three stages:
+   #
+   # * Tag the text
+   # * Extract all the maximal noun phrases
+   # * Recursively extract all noun phrases from the MNPs
+   #
+   def get_words(text)
+     return false unless valid_text(text)
+     tagged = add_tags(text)
+     if(@conf[:longest_noun_phrase] <= 1)
+       return get_nouns(tagged)
+     else
+       return get_noun_phrases(tagged)
+     end
+   end
+
+   # Return an easy-on-the-eyes tagged version of a text string.
+   # Applies add_tags and reformats to be easier to read.
+   def get_readable(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = add_tags(text, verbose)
+     tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+       $1 + '/' + $2.upcase
+     end
+     return tagged
+   end
+
+   # Return an array of sentences (without POS tags) from a text.
+   def get_sentences(text)
+     return nil unless valid_text(text)
+     tagged = add_tags(text)
+     sentences = Array.new
+     tagged.split(/<\/pp>/).each do |line|
+       sentences << strip_tags(line)
+     end
+     sentences = sentences.map do |sentence|
+       sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
+       sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
+       sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
+       sentence.gsub(Regexp.new(" (\W+)$")){$1}
+       sentence.gsub(Regexp.new("^(`+) ")){$1}
+     end
+     return sentences
+   end
+
+   # Given a POS-tagged text, this method returns a hash of all proper nouns
+   # and their occurrence frequencies. The method is greedy and will
+   # return multi-word phrases, if possible, so it would find ``Linguistic
+   # Data Consortium'' as a single unit, rather than as three individual
+   # proper nouns. This method does not stem the found words.
+   def get_proper_nouns(tagged)
+     return nil unless valid_text(tagged)
+     tags = [NNP]
+     nnp = build_matches_hash(build_trimmed(tagged, tags))
+     # Now for some fancy resolution stuff...
+     nnp.keys.each do |key|
+       words = key.split(/\s/)
+       # Let's say this is an organization's name --
+       # (and it's got at least three words)
+       # is there a corresponding acronym in this hash?
+       if words.length > 2
+         # Make a (naive) acronym out of this name
+         acronym = words.map do |word|
+           /\A([a-z])[a-z]*\z/ =~ word
+           $1
+         end.join ''
+         # If that acronym has been seen,
+         # remove it and add the values to
+         # the full name
+         if nnp[acronym]
+           nnp[key] += nnp[acronym]
+           nnp.delete(acronym)
+         end
+       end
+     end
+     return nnp
+   end
+
+   # Given a POS-tagged text, this method returns all nouns and their
+   # occurrence frequencies.
+   def get_nouns(tagged)
+     return nil unless valid_text(tagged)
+     tags = [NN]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   # Returns all types of verbs and does not discriminate between the various kinds.
+   # Is the combination of all other verb methods listed in this class.
+   def get_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VB, VBD, VBG, PART, VBP, VBZ]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_infinitive_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VB]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_past_tense_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBD]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_gerund_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBG]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_passive_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [PART]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_base_present_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBP]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_present_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBZ]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_adjectives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [JJ]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_comparative_adjectives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [JJR]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_superlative_adjectives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [JJS]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_adverbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [RB, RBR, RBS, RP]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_interrogatives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [WRB, WDT, WP, WPS]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+   # To be consistent with the documentation's naming of 'interrogative' parts of speech as 'question'
+   alias_method :get_question_parts, :get_interrogatives
+
+   # Returns all types of conjunctions and does not discriminate between the various kinds.
+   # E.g. coordinating, subordinating, correlative...
+   def get_conjunctions(tagged)
+     return nil unless valid_text(tagged)
+     tags = [CC, IN]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   # Given a POS-tagged text, this method returns only the maximal noun phrases.
+   # May be called directly, but is also used by get_noun_phrases
+   def get_max_noun_phrases(tagged)
+     return nil unless valid_text(tagged)
+     tags = [@@mnp]
+     mn_phrases = build_trimmed(tagged, tags)
+     ret = Hash.new(0)
+     mn_phrases.each do |p|
+       p = stem(p) unless p =~ /\s/ # stem single words
+       ret[p] += 1 unless p =~ /\A\s*\z/
+     end
+     return ret
+   end
+
+   # Similar to get_words, but requires a POS-tagged text as an argument.
+   def get_noun_phrases(tagged)
+     return nil unless valid_text(tagged)
+     found = Hash.new(0)
+     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
+     scanned = tagged.scan(@@mnp)
+     # Find MNPs in the text, one sentence at a time
+     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
+     mn_phrases = []
+     scanned.each do |m|
+       found[m] += 1 if phrase_ext =~ m
+       mn_phrases += m.split(phrase_ext)
+     end
+     mn_phrases.each do |mnp|
+       # Split the phrase into an array of words, and create a loop for each word,
+       # shortening the phrase by removing the word in the first position.
+       # Record the phrase and any single nouns that are found
+       words = mnp.split
+       words.length.times do |i|
+         found[words.join(' ')] += 1 if words.length > 1
+         w = words.shift
+         found[w] += 1 if w =~ /#{NN}/
+       end
+     end
+     ret = Hash.new(0)
+     found.keys.each do |f|
+       k = strip_tags(f)
+       v = found[f]
+       # We weight by the word count to favor long noun phrases
+       space_count = k.scan(/\s+/)
+       word_count = space_count.length + 1
+       # Throttle MNPs if necessary
+       next if word_count > @conf[:longest_noun_phrase]
+       k = stem(k) unless word_count > 1 # stem single words
+       multiplier = 1
+       multiplier = word_count if @conf[:weight_noun_phrases]
+       ret[k] += multiplier * v
+     end
+     return ret
+   end
+
+   # Reads some included corpus data and saves it in a stored hash on the
+   # local file system. This is called automatically if the tagger can't
+   # find the stored lexicon.
+   def install
+     puts "Creating part-of-speech lexicon" if @conf[:debug]
+     load_tags(@conf[:tag_lex])
+     load_words(@conf[:word_lex])
+     load_words(@conf[:unknown_lex])
+     File.open(@conf[:word_path], 'w') do |f|
+       Marshal.dump(@@lexicon, f)
+     end
+     File.open(@conf[:tag_path], 'w') do |f|
+       Marshal.dump(@@hmm, f)
+     end
+   end
+
+   ###################
+   # Private methods #
+   ###################
+
+   :private
+
+   def build_trimmed(tagged, tags)
+     tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
+       strip_tags(n)
+     end
+   end
+
+   def build_matches_hash(trimmed)
+     ret = Hash.new(0)
+     trimmed.each do |n|
+       n = stem(n)
+       next unless n.length < 100 # sanity check on word length
+       ret[n] += 1 unless n =~ /\A\s*\z/
+     end
+     ret
+   end
+
+   # Downcase the first letter of word
+   def lcfirst(word)
+     word.split(//)[0].downcase + word.split(//)[1..-1].join
+   end
+
+   # Upcase the first letter of word
+   def ucfirst(word)
+     word.split(//)[0].upcase + word.split(//)[1..-1].join
+   end
+
+   # Return the word stem as given by Stemmable module. This can be
+   # turned off with the class parameter @conf[:stem] => false.
+   def stem(word)
+     return word unless @conf[:stem]
+     return word.stem
+   end
+
+   # This method will reset the preceding tag to a sentence ender (PP).
+   # This prepares the first word of a new sentence to be tagged correctly.
+   def reset
+     @conf[:current_tag] = 'pp'
+   end
+
+   # Check whether the text is a valid string
+   def valid_text(text)
+     if !text
+       # there's nothing to parse
+       "method call on uninitialized variable" if @conf[:debug]
+       return false
+     elsif /\A\s*\z/ =~ text
+       # text is an empty string, nothing to parse
+       return false
+     else
+       # $text is valid
+       return true
+     end
+   end
+
+   # Return a text string with the part-of-speech tags removed
+   def strip_tags(tagged, downcase = false)
+     return nil unless valid_text(tagged)
+     text = tagged.gsub(/<[^>]+>/m, "")
+     text = text.gsub(/\s+/m, " ")
+     text = text.gsub(/\A\s*/, "")
+     text = text.gsub(/\s*\z/, "")
+     if downcase
+       return text.downcase
+     else
+       return text
+     end
+   end
+
+   # Strip the provided text of HTML-style tags and separate off any punctuation
+   # in preparation for tagging
+   def clean_text(text)
+     return false unless valid_text(text)
+     text = text.toutf8
+     unless $no_hpricot
+       # Strip out any markup and convert entities to their proper form
+       cleaned_text = Hpricot(text).inner_text
+     else
+       cleaned_text = text
+     end
+     tokenized = []
+     # Tokenize the text (splitting on punctuation as you go)
+     cleaned_text.split(/\s+/).each do |line|
+       tokenized += split_punct(line)
+     end
+     words = split_sentences(tokenized)
+     return words
+   end
+
+   # This handles all of the trailing periods, keeping those that
+   # belong on abbreviations and removing those that seem to be
+   # at the end of sentences. This method makes some assumptions
+   # about the use of capitalization in the incoming text
+   def split_sentences(array)
+     tokenized = array
+     people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                 supt det mssrs rev)
+     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
+     inst = %w(dept univ assn bros ph.d)
+     place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+                hwy hway la pde pd plz pl rd st tce)
+     comp = %w(mfg inc ltd co corp)
+     state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
+     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
+     misc = %w(vs etc no esp)
+     abbr = Hash.new
+     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
+       abbr[i] = true
+     end
+     words = Array.new
+     tokenized.each_with_index do |t, i|
+       if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+         w = $1
+         # Don't separate the period off words that
+         # meet any of the following conditions:
+         #
+         # 1. It is defined in one of the lists above
+         # 2. It is only one letter long: Alfred E. Sloan
+         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
+         unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+           words << w
+           words << '.'
+           next
+         end
+       end
+       words << tokenized[i]
+     end
+     # If the final word ends in a period...
+     if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+       words[-1] = $1
+       words.push '.'
+     end
+     return words
+   end
+
+   # Separate punctuation from words, where appropriate. This leaves trailing
+   # periods in place to be dealt with later. Called by the clean_text method.
+   def split_punct(text)
+     # If there's no punctuation, return immediately
+     return [text] if /\A\w+\z/ =~ text
+     # Sanity checks
+     text = text.gsub(/\W{10,}/o, " ")
+
+     # Put quotes into a standard format
+     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
+     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+     text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
+     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+
+     # Handle all other punctuation
+     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
+     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
+     text = text.gsub(/:/o, " :") # Shift colons off
+     text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
+     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+
+     # English-specific contractions
+     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
+     text = text.gsub(/n't\b/o, " n't") # Separate off n't
+     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
+     result = text.split(' ')
+     return result
+   end
+
+   # Given a preceding tag, assign a tag to a word. Called by the add_tags method.
+   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
+   def assign_tag(prev_tag, word)
+     if word == "-unknown-"
+       # classify unknown words accordingly
+       return @conf[:unknown_word_tag]
+     elsif word == "-sym-"
+       # If this is a symbol, tag it as a symbol
+       return "sym"
+     end
+     best_so_far = 0
+     w = @@lexicon[word]
+     t = @@hmm
+
+     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
+     # which is used in most POS taggers
+     best_tag = ""
+     t[prev_tag].keys.each do |tag|
+       # With @config[:relax] set, this method
+       # will also include any `open classes' of POS tags
+       pw = 0
+       if w[tag]
+         pw = w[tag]
+       elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+         pw = 0
+       else
+         next
+       end
+
+       # Bayesian logic:
+       # P = P( tag | prev_tag ) * P( tag | word )
+       probability = t[prev_tag][tag] * (pw + 1)
+       # Set the tag with maximal probability
+       if probability > best_so_far
+         best_so_far = probability
+         best_tag = tag
+       end
+     end
+     return best_tag
+   end
+
+   # This method determines whether a word should be considered in its
+   # lower or upper case form. This is useful in considering proper nouns
+   # and words that begin sentences. Called by add_tags.
+   def clean_word(word)
+     lcf = lcfirst(word)
+     # seen this word as it appears (lower or upper case)
+     if @@lexicon[word]
+       return word
+     elsif @@lexicon[lcf]
+       # seen this word only as lower case
+       return lcf
+     else
+       # never seen this word. guess.
+       return classify_unknown_word(word)
+     end
+   end
+
+   # This changes any word not appearing in the lexicon to identifiable
+   # classes of words handled by a simple unknown word classification
+   # metric. Called by the clean_word method.
+   def classify_unknown_word(word)
+     if /[\(\{\[]/ =~ word # Left brackets
+       classified = "*LRB*"
+     elsif
+       /[\)\}\]]/ =~ word # Right brackets
+       classified = "*RRB*"
+     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
+       classified = "*NUM*"
+     elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
+       classified = "*NUM*"
+     elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
+       classified = "*ORD*"
+     elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
+       classified = "-abr-"
+     elsif /\w-\w/o =~ word # Hyphenated word
+       /-([^-]+)\z/ =~ word
+       h_suffix = $1
+       if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+         # last part of this is defined as an adjective
+         classified = "-hyp-adj-"
+       else
+         # last part of this is not defined as an adjective
+         classified = "-hyp-"
+       end
+     elsif /\A\W+\z/o =~ word
+       classified = "-sym-" # Symbol
+     elsif word == ucfirst(word)
+       classified = "-cap-" # Capitalized word
+     elsif /ing\z/o =~ word
+       classified = "-ing-" # Ends in 'ing'
+     elsif /s\z/o =~ word
+       classified = "-s-" # Ends in 's'
+     elsif /tion\z/o =~ word
+       classified = "-tion-" # Ends in 'tion'
+     elsif /ly\z/o =~ word
+       classified = "-ly-" # Ends in 'ly'
+     elsif /ed\z/o =~ word
+       classified = "-ed-" # Ends in 'ed'
+     else
+       classified = "-unknown-" # Completely unknown
+     end
+     return classified
+   end
+
+   # This returns a compiled regexp for extracting maximal noun phrases
+   # from a POS-tagged text.
+   def get_max_noun_regex
+     regex = /
+       # optional number, gerund - adjective - participle
+       (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+       # Followed by one or more nouns
+       (?:#{NN})+
+       (?:
+         # Optional preposition, determinant, cardinal
+         (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+         # Optional gerund - adjective - participle
+         (?:#{GER}|#{ADJ}|#{PART})*
+         # one or more nouns
+         (?:#{NN})+
+       )*
+     /xo #/
+     return regex
+   end
+
+   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+   def load_tags(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       tags = Hash.new
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@hmm[key] = pairs
+     end
+     fh.close
+   end
+
+   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+   def load_words(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       tags = Hash.new
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@lexicon[key] = pairs
+     end
+     fh.close
+   end
+
+   # memoize the stem and assign_tag methods
+   memoize("stem")
+   memoize("assign_tag")
+ end
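
For orientation, here is a minimal usage sketch of the 0.2.2 surface, built only from methods visible in the diff above (the sample sentence is hypothetical, and the exact tags and counts returned depend on the lexicon bundled with the gem):

require 'engtagger'

tgr = EngTagger.new
text = "The fat cat quickly ran to catch the slow mouse."
tagged = tgr.add_tags(text) # XML-style string of <tag>word</tag> tokens

# New in 0.2.2: per-part-of-speech helpers; each returns a hash of
# token => occurrence count (built via build_trimmed/build_matches_hash)
tgr.get_verbs(tagged)            # all verb forms combined (vb, vbd, vbg, vbn, vbp, vbz)
tgr.get_past_tense_verbs(tagged) # vbd only
tgr.get_adjectives(tagged)       # jj only
tgr.get_adverbs(tagged)          # rb, rbr, rbs, rp
tgr.get_question_parts(tagged)   # alias of get_interrogatives (wrb, wdt, wp, wps)
tgr.get_conjunctions(tagged)     # cc and in

Note that these helpers expect already-tagged text rather than raw text: each one validates its input with valid_text and then scans for its tag regexps, so pass the result of add_tags, not the original string.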