excite 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
data/lib/excite/crfparser.rb
@@ -0,0 +1,322 @@
# encoding: UTF-8

require 'tempfile'
require 'nokogiri'
require 'cgi'
require 'engtagger'

module Excite

  class CRFParser

    attr_reader :feature_order
    attr_reader :token_features

    include TokenFeatures
    include Preprocessor
    include Postprocessor

    DIR = File.dirname(__FILE__)
    TAGGED_REFERENCES = "#{DIR}/resources/trainingdata/tagged_references.txt"
    TAGGED_HTML_REFERENCES = "#{DIR}/resources/trainingdata/tagged_html_references.txt"
    TRAINING_DATA = "#{DIR}/resources/trainingdata/training_data.txt"
    MODEL_FILE = "#{DIR}/resources/model"
    HTML_MODEL_FILE = "#{DIR}/resources/html_model"
    TEMPLATE_FILE = "#{DIR}/resources/parsCit.template"
    HTML_TEMPLATE_FILE = "#{DIR}/resources/html.template"
    CONFIG_FILE = "#{DIR}/../../config/parscit_features.yml"

    # Feature functions must be performed in alphabetical order, since
    # later functions may depend on earlier ones.
    # TODO This seems pretty confusing and dependent on the current features.
    def initialize(mode=:string)
      @mode = mode

      f = File.open(CONFIG_FILE, 'r')
      hsh = YAML::load(f)[mode.to_s]
      @feature_order = hsh["feature_order"].map(&:to_sym)
      @token_features = hsh["feature_order"].sort.map(&:to_sym)
    end
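For reference, the loader above implies a shape for config/parscit_features.yml: one top-level key per mode, each holding a feature_order list. The sketch below is a guess at that minimal shape (the feature names are invented, not copied from the bundled file):

    # string:
    #   feature_order:
    #     - token
    #     - is_cap
    # html:
    #   feature_order:
    #     - token
    #     - in_link

Since @token_features is the alphabetically sorted copy of the same list, the feature methods run in sorted order (so later ones may depend on earlier ones, per the comment above) while the output columns keep feature_order order.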
    def model
      @model ||= CRFPP::Tagger.new("-m #{default_model_file} -v 1")
    end

    def parse(str, presumed_author=nil)
      raw_string = str.dup

      toks, features = str_2_features(str, false, presumed_author)
      tags, overall_prob, tag_probs = eval_crfpp(features, model)

      ret = {}
      tags.each_with_index { |t, i| (ret[t] ||= []) << toks[i].for_join(toks[i-1]) }
      ret.each { |k, v| ret[k] = v.join('').strip }

      normalize_fields(ret)
      ret['raw_string'] = raw_string
      [ret, overall_prob, tag_probs]
    end
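A minimal usage sketch of this entry point, assuming the crfpp bindings and the bundled model load correctly (the citation string and field values below are illustrative, not captured output):

    parser = Excite::CRFParser.new(:string)
    fields, overall_prob, tag_probs = parser.parse('Smith, J. A. A Study of Things. Journal of Examples, 12(3), 2004.')
    fields['title']      # e.g. "A Study of Things"
    fields['raw_string'] # always echoes the untouched input
    overall_prob         # sequence-level confidence from CRF++

Note that parse works on a dup of the input and returns the normalized fields alongside the overall and per-tag probabilities.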
    def eval_crfpp(feat_seq, model)
      model.clear
      feat_seq.each {|vec|
        line = vec.join(" ").strip
        raise unless model.add(line)
      }
      raise unless model.parse
      tags = []
      probs = {}
      feat_seq.length.times {|i|
        tags << model.y2(i)
        probs[model.y2(i)] ||= 1
        probs[model.y2(i)] *= model.prob(i)
      }
      [tags, model.prob, probs]
    end

    def self.strip_punct(str)
      toknp = str.gsub(/[^\w]/, '')
      toknp = "EMPTY" if toknp.blank? # TODO Seems maybe hacky
      toknp
    end

    def normalize_input_author(str)
      return nil if str.blank?
      str.split.map(&:downcase).map{ |t| self.class.strip_punct(t) }.select{ |s| s.length > 2 }
    end

    def prepare_token_data(raw_string, training=false)
      if training
        tags = tagged_string_2_tags(raw_string.strip)

        labels, raw_string, joined_tokens = [], '', ''
        tags.each do |tag|
          raw = CGI.unescapeHTML(tag.inner_html)

          label = tag.name
          raise "Invalid label #{label} for:\n#{raw}" if label.present? && !recognized_labels.include?(label)

          toks = str_2_tokens(raw)

          labels << [label, joined_tokens.length]
          joined_tokens += toks.map(&:raw).join
          raw_string += "\n#{raw}"
        end
      end

      tokens = str_2_tokens(raw_string.strip)

      if training
        joined_tokens = ''
        label, _ = labels.shift
        next_label, end_idx = labels.shift unless labels.empty?

        tokens.each do |tok|
          tok.label = label
          joined_tokens += tok.raw
          if joined_tokens.length == end_idx
            label = next_label
            next_label, end_idx = labels.shift unless labels.empty?
          elsif joined_tokens.length > end_idx && !labels.empty?
            raise "Tokens do not match labels"
          end
        end
        raise "Unused label" unless labels.empty?
      end

      self.clear

      return tokens
    end

    def tagger
      @tagger ||= EngTagger.new
    end

    def tagged_string_2_tags(str)
      str = "<string>#{str}</string>"
      node = Nokogiri::XML.fragment(str).css('string')
      node.children.reject(&:text?)
    end

    def str_2_tokens(str)
      if @mode == :html
        toks = html_str_2_tokens(str)
      elsif @mode == :string
        toks = text_str_2_tokens(str)
      end

      toks.reject { |t| t.empty? }
    end

    def recognized_labels
      if @mode == :string
        ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "tech"]
      elsif @mode == :html
        ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "workid", "link", "bullet"]
      else
        []
      end
    end

    def html_str_2_tokens(str)
      html = Nokogiri::HTML.fragment(str.gsub('>', '> ')) # gsub to ensure strings in separate tags are always separate tokens even if HTML is bad

      tokens = []
      html.traverse do |node|
        tokens += html_text_node_2_tokens(node) if node.text?
      end
      tokens
    end

    def html_text_node_2_tokens(node)
      text = CGI.unescapeHTML(node.text)
      return [] if text.blank?

      tokens = text_str_2_tokens(text)
      tokens.each_with_index { |tok, i| tok.is_in_node!(node, i, tokens.length) }
      tokens
    end

    def text_str_2_tokens(text)
      tagged = tagger.add_tags(normalize_citation(text))
      tags = tagged_string_2_tags(tagged.gsub('&', '&amp;')) # EngTagger has legitimately added angle brackets which are meaningful in XML, but ampersands predate EngTagger and are semantic
      tags.map { |tag| Token.new(tag.text, tag.name) }
    end
    # calculate features on the full citation string
    def str_2_features(raw_string, training=false, presumed_author=nil)
      features = []
      tokens = prepare_token_data(raw_string, training)

      author_names = normalize_input_author(presumed_author)

      tokens.each_with_index do |tok, toki|
        raise "All tokens must be labeled" if training && tok.label.nil?

        feats = {}

        @token_features.each {|f|
          feats[f] = self.send(f, tokens, toki, author_names)
        }

        features << [tok.raw]
        @feature_order.each {|f| features.last << feats[f]}
        features.last << tok.label if training
      end

      [tokens, features]
    end

    def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
      tagged_refs ||= default_tagged_references

      fin = File.open(tagged_refs, 'r')
      fout = File.open(training_data, 'w')
      while l = fin.gets
        _, data = str_2_features(l.strip, true)
        data.each {|line| fout.write("#{line.join(" ")}\n") }
        fout.write("\n")
      end

      fin.close
      fout.flush
      fout.close
    end

    def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
      tagged_refs ||= default_tagged_references
      model ||= default_model_file
      template ||= default_template_file

      if training_data.nil?
        training_data = TRAINING_DATA
        write_training_file(tagged_refs, training_data)
      end

      `crf_learn #{template} #{training_data} #{model} -f3 1>&2`
    end

    def default_tagged_references
      if @mode == :string
        TAGGED_REFERENCES
      elsif @mode == :html
        TAGGED_HTML_REFERENCES
      else
        raise "Unknown mode: #{@mode}"
      end
    end

    def default_model_file
      if @mode == :string
        MODEL_FILE
      elsif @mode == :html
        HTML_MODEL_FILE
      else
        raise "Unknown mode: #{@mode}"
      end
    end

    def default_template_file
      if @mode == :string
        TEMPLATE_FILE
      elsif @mode == :html
        HTML_TEMPLATE_FILE
      else
        raise "Unknown mode: #{@mode}"
      end
    end

  end

  class TrainingError < Exception; end

  class Token

    attr_reader :node, :idx_in_node, :node_token_count, :part_of_speech
    attr_accessor :label

    def initialize(str, part_of_speech=nil)
      @str = str
      @part_of_speech = part_of_speech
    end

    def is_in_node!(node, idx_in_node, node_token_count)
      @node = node
      @idx_in_node = idx_in_node
      @node_token_count = node_token_count
    end

    def raw
      @str
    end

    def np
      @np ||= CRFParser.strip_punct(@str)
    end

    def lcnp
      @lcnp ||= np == "EMPTY" ? np : np.downcase
    end

    def empty?
      raw.strip.blank?
    end

    def to_s
      "{#{raw}}"
    end

    def for_join(prev)
      if ['pp','ppc','ppr','pps','rrb', 'pos'].include?(part_of_speech)
        raw
      elsif prev && ['ppd','ppl','lrb'].include?(prev.part_of_speech)
        raw
      else
        " "+raw
      end
    end
  end

end
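For orientation, training shells out to the CRF++ command line; a hedged sketch of retraining the bundled HTML model (assumes the crf_learn binary is installed and on the PATH, which the gem does not arrange for you):

    parser = Excite::CRFParser.new(:html)
    parser.train # regenerates training_data.txt from the tagged references, then runs crf_learn

write_training_file emits one whitespace-separated feature row per token with the label as the final column, and a blank line between references, which is the sequence format CRF++ expects.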
data/lib/excite/postprocessor.rb
@@ -0,0 +1,252 @@
# encoding: UTF-8

module Excite

  module Postprocessor

    def normalize_fields(citation_hsh)
      citation_hsh.keys.each {|key| self.send("normalize_#{key}", citation_hsh) }
      citation_hsh
    end

    # Call normalize on any fields that don't have their own normalization
    # method defined
    def method_missing(m, *args, &block)
      if m.to_s =~ /^normalize_(.*)$/
        normalize($1, *args)
      else
        super
      end
    end

    # default normalization function for all fields that do not have their
    # own normalization
    # Strip any leading and/or trailing punctuation and space
    def normalize(key, hsh)
      hsh[key].gsub!(/^[^A-Za-z0-9]+/, '')
      hsh[key].gsub!(/[^A-Za-z0-9]+$/, '')
    end

    # strip leading numerals
    # if the real title is quoted inside this string, try to extract it
    # if the title has at least 2 words before a newline or period or open parens, strip everything after
    # TODO could do better with knowledge of prepositions, names - maybe we just need a second model?
    def normalize_title(hsh)
      str = hsh['title'].strip

      numeral_regexes = [
        /^[0-9]+[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial numbers + punctuation + a space, quote, or capital letter
        /^C{0,3}(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial roman numerals
        /^[A-Z][.)](\s+|(?=["'”’´‘“`'A-Z]))/i # initial single letter
      ]

      numeral_regexes.each do |regex|
        if str.gsub!(regex, '')
          break
        end
      end

      if (m = str.match /^(["'”’´‘“`'])/)
        quote_char = m[1]
        pairable = pairable_quote_chars(quote_char)

        if str.scan(/[#{pairable}]/).length >= 2
          str.gsub!(/^#{quote_char}/, '')
          str.gsub!(/[#{pairable}][^#{pairable}]+$/, '')
        end
      end

      while (m = str.match /\S+\s+\S+.*(\n|\.(?=\s|\()).*/)
        i = str.rindex m[1]
        str = str[0..i-1]
      end

      hsh['title'] = str
      normalize('title', hsh)
    end

    def pairable_quote_chars(quote_char)
      [%{"”“}, %{’'`‘´'}].each do |chars|
        return chars if chars.include? quote_char
      end
    end

    ##
    # Tries to split the author tokens into individual author names
    # and then normalizes these names individually. Returns a
    # list of author names.
    ##
    def normalize_author(hsh)
      str = hsh['author']
      tokens = repair_and_tokenize_author_text(str)
      authors = []
      current_auth = []
      begin_auth = 1
      tokens.each {|tok|
        if tok =~ /^(&|and)$/i
          if !current_auth.empty?
            auth = normalize_author_name(current_auth)
            authors << auth
          end
          current_auth = []
          begin_auth = 1
          next
        end
        if begin_auth > 0
          current_auth << tok
          begin_auth = 0
          next
        end
        if tok =~ /,$/
          current_auth << tok
          if !current_auth.empty?
            auth = normalize_author_name(current_auth)
            authors << auth
            current_auth = []
            begin_auth = 1
          end
        else
          current_auth << tok
        end
      }
      if !current_auth.empty?
        auth = normalize_author_name(current_auth)
        authors << auth.strip unless auth.strip == "-" || auth.strip.blank?
      end
      hsh['authors'] = authors if !authors.empty?
      normalize('author', hsh)
      hsh
    end

    def normalize_date(hsh)
      str = hsh['date']
      if str =~ /(\d{4})/
        year = $1.to_i
        current_year = Time.now.year
        if year <= current_year + 3
          ret = year
          hsh['year'] = ret
        else
          ret = nil
        end
      end
      hsh['date'] = ret
      hsh
    end

    def normalize_volume(hsh)
      # If there are two numbers, they are volume and number,
      # e.g. "23(2)", "Vol. 23, No. 3", etc.
      if hsh['volume'] =~ /\D*(\d+)\D+(\d+)/i
        hsh['volume'] = $1
        hsh['number'] = $2
      # Otherwise, just pull out a number and hope that it's the volume
      elsif hsh['volume'] =~ /(\d+)/
        hsh['volume'] = $1
      end
      hsh
    end

    ##
    # Normalizes page fields into the form "start--end". If the page
    # field does not appear to be in a standard form, does nothing.
    ##
    def normalize_pages(hsh)
      # "vol.issue (year):pp"
      case hsh['pages']
      when /(\d+) (?: \.(\d+))? (?: \( (\d\d\d\d) \))? : (\d.*)/x
        hsh['volume'] = $1
        hsh['number'] = $2 if $2
        hsh['year'] = $3 if $3
        hsh['pages'] = $4
      end

      case hsh['pages']
      when /(\d+)[^\d]+(\d+)/
        hsh['pages'] = "#{$1}--#{$2}"
      when /(\d+)/
        hsh['pages'] = $1
      end
      hsh
    end
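Hand-traced examples of the two passes above (inputs invented, not taken from the gem's specs; assume the calls happen inside a class that includes Excite::Postprocessor):

    h = { 'pages' => '12.3(2004):45-67' }
    normalize_pages(h)
    # h => { 'pages' => '45--67', 'volume' => '12', 'number' => '3', 'year' => '2004' }

    normalize_pages('pages' => 'pp. 110-25')['pages'] # => "110--25"

The first case statement peels volume/issue/year off a "vol.issue (year):pp" prefix; the second rewrites any remaining start/end pair to the canonical double-hyphen form.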
    def repair_and_tokenize_author_text(author_text)
      # Repair obvious parse errors and weird notations.
      author_text.sub!(/et\.? al\.?.*$/, '')
      # FIXME: maybe I'm mis-understanding Perl regular expressions, but
      # this pattern from ParseCit appears to do the Wrong Thing:
      # author_text.sub!(/^.*?[a-zA-Z][a-zA-Z]+\. /, '')
      author_text.gsub!(/\(.*?\)/, '')
      author_text.gsub!(/^.*?\)\.?/, '')
      author_text.gsub!(/\(.*?$/, '')
      author_text.gsub!(/\[.*?\]/, '')
      author_text.gsub!(/^.*?\]\.?/, '')
      author_text.gsub!(/\[.*?$/, '')
      author_text.gsub!(/;/, ',')
      author_text.gsub!(/,/, ', ')
      author_text.gsub!(/\:/, ' ')
      author_text.gsub!(/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]/, '')
      author_text = join_multi_word_names(author_text)

      orig_tokens = author_text.split(/\s+/)
      tokens = []
      last = false
      orig_tokens.each_with_index {|tok, i|
        if tok !~ /[A-Za-z&]/
          if i < orig_tokens.length/2
            tokens = []
            next
          else
            last = true
          end
        end
        if (tok =~ /^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i and
            tokens.last =~ /\,$/) or
            tok =~ /^[IVX][IVX]+\.?\,?$/

          next
        end
        tokens << tok
        break if last
      }
      tokens
    end # repair_and_tokenize_author_text

    # Insert underscores to join name particles, i.e.
    # Jon de Groote ---> Jon de_Groote
    def join_multi_word_names(author_text)
      author_text.gsub(/\b((?:van|von|der|den|de|di|le|el))\s/i) {
        "#{$1}_"
      }
    end

    ##
    # Tries to normalize an individual author name into the form
    # "First Middle Last", without punctuation.
    ##
    def normalize_author_name(auth_toks)
      return '' if auth_toks.empty?
      str = auth_toks.join(" ")
      if str =~ /(.+),\s*(.+)/
        str = "#{$1} #{$2}"
      end
      str.gsub!(/\.\-/, '-')
      str.gsub!(/[\,\.]/, ' ')
      str.gsub!(/ +/, ' ')
      str.strip!

      if (str =~ /^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/)
        new_toks = str.split(/\s+/)
        new_order = new_toks[1...new_toks.length]
        new_order << new_toks[0]
        str = new_order.join(" ")
      end

      str.gsub!(/^[^A-Za-z0-9]+/, '')
      str.gsub!(/[^A-Za-z0-9]+$/, '')
      return str
    end

  end

end
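A hand-traced example of the surname flip (input invented): for tokens from "Smith, J. A.", the comma branch rewrites the string to "Smith J. A.", punctuation collapses to spaces, and because every trailing token is a bare initial, the reorder step moves the surname to the end:

    normalize_author_name(["Smith,", "J.", "A."]) # => "J A Smith"

Multi-token surnames are not mangled here because join_multi_word_names has already fused particles ("de Groote" -> "de_Groote") into single tokens.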
data/lib/excite/preprocessor.rb
@@ -0,0 +1,107 @@
# encoding: UTF-8

module Excite

  module Preprocessor

    MARKER_TYPES = {
      :SQUARE => '\\[.+?\\]',
      :PAREN => '\\(.+?\\)',
      :NAKEDNUM => '\\d+',
      :NAKEDNUMDOT => '\\d+\\.',
    }

    CLEANUP_RULES_FILE = "#{File.dirname(__FILE__)}/../../config/citation_cleanup_rules.yml"

    def cleanup_rules
      return @rules if @rules

      raw = YAML.load_file CLEANUP_RULES_FILE
      @rules = raw['order'].map do |rule_name|
        re = Regexp.new(raw['rules'][rule_name]['regex'], raw['rules'][rule_name]['ignore_case'])
        repl = raw['rules'][rule_name]['replacement_str'] || ''
        { re: re, repl: repl }
      end
    end
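The loader above implies a schema for config/citation_cleanup_rules.yml: a top-level order list naming rules, and a rules map whose entries carry regex, ignore_case, and an optional replacement_str. A hypothetical entry (rule name and pattern invented, not copied from the bundled file):

    # order:
    #   - strip_leading_marker
    # rules:
    #   strip_leading_marker:
    #     regex: '^\[\d+\]\s*'
    #     ignore_case: false
    #     replacement_str: ''

Note that Regexp.new treats any truthy non-integer second argument as case-insensitive, so the ignore_case flag maps straight through.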
    ##
    # Removes lines that appear to be junk from the citation text,
    # and applies cleanup regexes from the configuration file.
    ##
    def normalize_cite_text(cite_text)
      cite_text.split(/\n/).reject do |line|
        line.blank? || line =~ /^[\s\d]*$/
      end.map do |line|
        normalize_citation(line)
      end.join("\n")
    end

    def normalize_citation(cite)
      cite = cite.dup

      cleanup_rules.each do |rule|
        cite.gsub!(rule[:re], rule[:repl])
      end

      cite
    end

    ##
    # Controls the process by which citations are segmented,
    # based on the result of trying to guess the type of
    # citation marker used in the reference section. Returns
    # a reference to a list of citation objects.
    ##
    def segment_citations(cite_text)
      marker_type = guess_marker_type(cite_text)
      unless marker_type == 'UNKNOWN'
        citations = split_citations_by_marker(cite_text, marker_type)
      else
        citations = split_unmarked_citations(cite_text)
      end
      return citations
    end

    ##
    # Segments citations that have explicit markers in the
    # reference section. Whenever a new line starts with an
    # expression that matches what we'd expect of a marker,
    # a new citation is started. Returns a reference to a
    # list of citation objects.
    ##
    def split_citations_by_marker(cite_text, marker_type=nil)
      citations = []
      current_citation = Citation.new
      current_citation_string = nil

      cite_text.split(/\n/).each {|line|
        if line =~ /^\s*(#{MARKER_TYPES[marker_type]})\s*(.*)$/
          marker, cite_string = $1, $2
          if current_citation_string
            current_citation.citation_string = current_citation_string
            citations << current_citation
            current_citation_string = nil
          end
          current_citation = Citation.new
          current_citation.marker_type = marker_type
          current_citation.marker = marker
          current_citation_string = cite_string
        else
          if current_citation_string =~ /\s\-$/
            current_citation_string.sub!(/\-$/, '')
            current_citation_string << line
          else
            current_citation_string << " " << line
          end
        end
      }

      if current_citation && current_citation_string
        current_citation.string = current_citation_string
        citations << current_citation
      end
      citations
    end

  end
end
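To make the marker flow concrete, a hedged sketch of how a square-bracketed reference block would move through the two methods above (Citation is the gem's own class from data/lib/excite/citation.rb, which this diff view does not show; guess_marker_type and split_unmarked_citations are likewise defined elsewhere, and the marker type is assumed to resolve to :SQUARE for this input):

    text = "[1] Smith, J. A Study of Things. 2004.\n[2] Doe, J. Another Study. 2006.\n"
    cites = segment_citations(text)
    cites.length       # => 2
    cites.first.marker # => "[1]"

Lines that do not open with a marker are appended to the current citation; a trailing " -" is treated as a line-break hyphenation and stripped before the continuation is joined.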