excite 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
 - data/.rspec +1 -0
 - data/Gemfile +8 -0
 - data/Gemfile.lock +69 -0
 - data/LICENSE +22 -0
 - data/README.md +46 -0
 - data/Rakefile +24 -0
 - data/config/citation_cleanup_rules.yml +68 -0
 - data/config/parscit_features.yml +55 -0
 - data/excite.gemspec +30 -0
 - data/lib/excite/array_helpers.rb +27 -0
 - data/lib/excite/citation.rb +48 -0
 - data/lib/excite/crfparser.rb +322 -0
 - data/lib/excite/postprocessor.rb +252 -0
 - data/lib/excite/preprocessor.rb +107 -0
 - data/lib/excite/resources/dicts/female-names +4954 -0
 - data/lib/excite/resources/dicts/first-names +27926 -0
 - data/lib/excite/resources/dicts/male-names +3901 -0
 - data/lib/excite/resources/dicts/months +24 -0
 - data/lib/excite/resources/dicts/places +43109 -0
 - data/lib/excite/resources/dicts/publishers +654 -0
 - data/lib/excite/resources/dicts/surnames +146259 -0
 - data/lib/excite/resources/html.template +84 -0
 - data/lib/excite/resources/html_model +0 -0
 - data/lib/excite/resources/model +0 -0
 - data/lib/excite/resources/parsCit.template +76 -0
 - data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
 - data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
 - data/lib/excite/resources/trainingdata/verify.rb +97 -0
 - data/lib/excite/token_features.rb +313 -0
 - data/lib/excite/version.rb +7 -0
 - data/lib/excite.rb +13 -0
 - data/model/test/analysis.csv +54 -0
 - data/model/test/array_helpers.rb +30 -0
 - data/model/test/html-analysis.csv +60 -0
 - data/model/test/html-output.txt +19893 -0
 - data/model/test/model_test.rb +306 -0
 - data/model/test/output.txt +16742 -0
 - data/spec/excite/citation_spec.rb +128 -0
 - data/spec/excite/crfparser_spec.rb +118 -0
 - data/spec/excite/postprocessor_spec.rb +68 -0
 - data/spec/excite/token_features_spec.rb +641 -0
 - data/spec/spec_helper.rb +4 -0
 - metadata +222 -0
 
| 
         @@ -0,0 +1,322 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require 'tempfile'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'cgi'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require 'engtagger'
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
module Excite

  # Citation parser backed by a CRF++ sequence model. Two modes:
  # :string for plain-text citations, :html for citations embedded in HTML.
  class CRFParser

    attr_reader :feature_order
    attr_reader :token_features

    include TokenFeatures
    include Preprocessor
    include Postprocessor

    DIR = File.dirname(__FILE__)
    # Bundled per-mode training data, models, and CRF++ feature templates.
    TAGGED_REFERENCES = "#{DIR}/resources/trainingdata/tagged_references.txt"
    TAGGED_HTML_REFERENCES = "#{DIR}/resources/trainingdata/tagged_html_references.txt"
    TRAINING_DATA = "#{DIR}/resources/trainingdata/training_data.txt"
    MODEL_FILE = "#{DIR}/resources/model"
    HTML_MODEL_FILE = "#{DIR}/resources/html_model"
    TEMPLATE_FILE = "#{DIR}/resources/parsCit.template"
    HTML_TEMPLATE_FILE = "#{DIR}/resources/html.template"
    CONFIG_FILE = "#{DIR}/../../config/parscit_features.yml"

    # Feature functions must be performed in alphabetical order, since
    # later functions may depend on earlier ones.
    # TODO This seems pretty confusing and dependent on the current features.
     | 
| 
      
 32 
     | 
    
         
            +
                def initialize(mode=:string)
         
     | 
| 
      
 33 
     | 
    
         
            +
                  @mode = mode
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
                  f = File.open(CONFIG_FILE, 'r')
         
     | 
| 
      
 36 
     | 
    
         
            +
                  hsh = YAML::load(f)[mode.to_s]
         
     | 
| 
      
 37 
     | 
    
         
            +
                  @feature_order = hsh["feature_order"].map(&:to_sym)
         
     | 
| 
      
 38 
     | 
    
         
            +
                  @token_features = hsh["feature_order"].sort.map(&:to_sym)
         
     | 
| 
      
 39 
     | 
    
         
            +
                end
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
                def model
         
     | 
| 
      
 42 
     | 
    
         
            +
                  @model ||= CRFPP::Tagger.new("-m #{default_model_file} -v 1");
         
     | 
| 
      
 43 
     | 
    
         
            +
                end
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
    # Parses one citation.
    #
    # str             - citation text (or HTML fragment when @mode == :html)
    # presumed_author - optional author-name hint fed to the author features
    #
    # Returns [fields, overall_prob, tag_probs]: fields maps each predicted
    # tag to its joined, normalized text, plus the original input under
    # 'raw_string'.
    def parse(str, presumed_author=nil)
      raw_string = str.dup

      toks, features = str_2_features(str, false, presumed_author)
      tags, overall_prob, tag_probs = eval_crfpp(features, model)

      # Group token text by predicted tag, preserving token order.
      # NOTE(review): for i == 0, toks[i-1] is the *last* token; for_join
      # presumably treats the first token specially — confirm in Token#for_join.
      ret = {}
      tags.each_with_index { |t, i| (ret[t] ||= []) << toks[i].for_join(toks[i-1]) }
      ret.each { |k, v| ret[k] = v.join('').strip }

      normalize_fields(ret)
      ret['raw_string'] = raw_string
      [ret, overall_prob, tag_probs]
    end
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
                def eval_crfpp(feat_seq, model)
         
     | 
| 
      
 61 
     | 
    
         
            +
                  model.clear
         
     | 
| 
      
 62 
     | 
    
         
            +
                  feat_seq.each {|vec|
         
     | 
| 
      
 63 
     | 
    
         
            +
                    line = vec.join(" ").strip
         
     | 
| 
      
 64 
     | 
    
         
            +
                    raise unless model.add(line)
         
     | 
| 
      
 65 
     | 
    
         
            +
                  }
         
     | 
| 
      
 66 
     | 
    
         
            +
                  raise unless model.parse
         
     | 
| 
      
 67 
     | 
    
         
            +
                  tags = []
         
     | 
| 
      
 68 
     | 
    
         
            +
                  probs = {}
         
     | 
| 
      
 69 
     | 
    
         
            +
                  feat_seq.length.times {|i|
         
     | 
| 
      
 70 
     | 
    
         
            +
                    tags << model.y2(i)
         
     | 
| 
      
 71 
     | 
    
         
            +
                    probs[model.y2(i)] ||= 1
         
     | 
| 
      
 72 
     | 
    
         
            +
                    probs[model.y2(i)] *= model.prob(i)
         
     | 
| 
      
 73 
     | 
    
         
            +
                  }
         
     | 
| 
      
 74 
     | 
    
         
            +
                  [tags, model.prob, probs]
         
     | 
| 
      
 75 
     | 
    
         
            +
                end
         
     | 
| 
      
 76 
     | 
    
         
            +
             
     | 
| 
      
 77 
     | 
    
         
            +
                def self.strip_punct(str)
         
     | 
| 
      
 78 
     | 
    
         
            +
                  toknp = str.gsub(/[^\w]/, '')
         
     | 
| 
      
 79 
     | 
    
         
            +
                  toknp = "EMPTY" if toknp.blank? # TODO Seems maybe hacky
         
     | 
| 
      
 80 
     | 
    
         
            +
                  toknp
         
     | 
| 
      
 81 
     | 
    
         
            +
                end
         
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
      
 83 
     | 
    
         
            +
                def normalize_input_author(str)
         
     | 
| 
      
 84 
     | 
    
         
            +
                  return nil if str.blank?
         
     | 
| 
      
 85 
     | 
    
         
            +
                  str.split.map(&:downcase).map{ |t| self.class.strip_punct(t) }.select{ |s| s.length > 2 }
         
     | 
| 
      
 86 
     | 
    
         
            +
                end
         
     | 
| 
      
 87 
     | 
    
         
            +
             
     | 
| 
      
 88 
     | 
    
         
            +
                def prepare_token_data(raw_string, training=false)
         
     | 
| 
      
 89 
     | 
    
         
            +
                  if training
         
     | 
| 
      
 90 
     | 
    
         
            +
                    tags = tagged_string_2_tags(raw_string.strip)
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
                    labels, raw_string, joined_tokens = [], '', ''
         
     | 
| 
      
 93 
     | 
    
         
            +
                    tags.each do |tag|
         
     | 
| 
      
 94 
     | 
    
         
            +
                      raw = CGI.unescapeHTML(tag.inner_html)
         
     | 
| 
      
 95 
     | 
    
         
            +
             
     | 
| 
      
 96 
     | 
    
         
            +
                      label = tag.name
         
     | 
| 
      
 97 
     | 
    
         
            +
                      raise "Invalid label #{label} for:\n#{str}" if label.present? && !recognized_labels.include?(label)
         
     | 
| 
      
 98 
     | 
    
         
            +
             
     | 
| 
      
 99 
     | 
    
         
            +
                      toks = str_2_tokens(raw)
         
     | 
| 
      
 100 
     | 
    
         
            +
             
     | 
| 
      
 101 
     | 
    
         
            +
                      labels << [label, joined_tokens.length]
         
     | 
| 
      
 102 
     | 
    
         
            +
                      joined_tokens += toks.map(&:raw).join
         
     | 
| 
      
 103 
     | 
    
         
            +
                      raw_string += "\n#{raw}"
         
     | 
| 
      
 104 
     | 
    
         
            +
                    end
         
     | 
| 
      
 105 
     | 
    
         
            +
                  end
         
     | 
| 
      
 106 
     | 
    
         
            +
             
     | 
| 
      
 107 
     | 
    
         
            +
                  tokens = str_2_tokens(raw_string.strip)
         
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
                  if training
         
     | 
| 
      
 110 
     | 
    
         
            +
                    joined_tokens = ''
         
     | 
| 
      
 111 
     | 
    
         
            +
                    label, _ = labels.shift
         
     | 
| 
      
 112 
     | 
    
         
            +
                    next_label, end_idx = labels.shift unless labels.empty?
         
     | 
| 
      
 113 
     | 
    
         
            +
             
     | 
| 
      
 114 
     | 
    
         
            +
                    tokens.each do |tok|
         
     | 
| 
      
 115 
     | 
    
         
            +
                      tok.label = label
         
     | 
| 
      
 116 
     | 
    
         
            +
                      joined_tokens += tok.raw
         
     | 
| 
      
 117 
     | 
    
         
            +
                      if joined_tokens.length == end_idx
         
     | 
| 
      
 118 
     | 
    
         
            +
                        label = next_label
         
     | 
| 
      
 119 
     | 
    
         
            +
                        next_label, end_idx = labels.shift unless labels.empty?
         
     | 
| 
      
 120 
     | 
    
         
            +
                      elsif joined_tokens.length > end_idx && !labels.empty?
         
     | 
| 
      
 121 
     | 
    
         
            +
                        raise "Tokens do not match labels"
         
     | 
| 
      
 122 
     | 
    
         
            +
                      end
         
     | 
| 
      
 123 
     | 
    
         
            +
                    end
         
     | 
| 
      
 124 
     | 
    
         
            +
                    raise "Unused label" unless labels.empty?
         
     | 
| 
      
 125 
     | 
    
         
            +
                  end
         
     | 
| 
      
 126 
     | 
    
         
            +
             
     | 
| 
      
 127 
     | 
    
         
            +
                  self.clear
         
     | 
| 
      
 128 
     | 
    
         
            +
             
     | 
| 
      
 129 
     | 
    
         
            +
                  return tokens
         
     | 
| 
      
 130 
     | 
    
         
            +
                end
         
     | 
| 
      
 131 
     | 
    
         
            +
             
     | 
| 
      
 132 
     | 
    
         
            +
                def tagger
         
     | 
| 
      
 133 
     | 
    
         
            +
                  @tagger ||= EngTagger.new
         
     | 
| 
      
 134 
     | 
    
         
            +
                end
         
     | 
| 
      
 135 
     | 
    
         
            +
             
     | 
| 
      
 136 
     | 
    
         
            +
                def tagged_string_2_tags(str)
         
     | 
| 
      
 137 
     | 
    
         
            +
                  str = "<string>#{str}</string>"
         
     | 
| 
      
 138 
     | 
    
         
            +
                  node = Nokogiri::XML.fragment(str).css('string')
         
     | 
| 
      
 139 
     | 
    
         
            +
                  node.children.reject(&:text?)
         
     | 
| 
      
 140 
     | 
    
         
            +
                end
         
     | 
| 
      
 141 
     | 
    
         
            +
             
     | 
| 
      
 142 
     | 
    
         
            +
                def str_2_tokens(str)
         
     | 
| 
      
 143 
     | 
    
         
            +
                  if @mode == :html
         
     | 
| 
      
 144 
     | 
    
         
            +
                    toks = html_str_2_tokens(str)
         
     | 
| 
      
 145 
     | 
    
         
            +
                  elsif @mode == :string
         
     | 
| 
      
 146 
     | 
    
         
            +
                    toks = text_str_2_tokens(str)
         
     | 
| 
      
 147 
     | 
    
         
            +
                  end
         
     | 
| 
      
 148 
     | 
    
         
            +
             
     | 
| 
      
 149 
     | 
    
         
            +
                  toks.reject { |t| t.empty? }
         
     | 
| 
      
 150 
     | 
    
         
            +
                end
         
     | 
| 
      
 151 
     | 
    
         
            +
             
     | 
| 
      
 152 
     | 
    
         
            +
                def recognized_labels
         
     | 
| 
      
 153 
     | 
    
         
            +
                  if @mode == :string
         
     | 
| 
      
 154 
     | 
    
         
            +
                    ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "tech"]
         
     | 
| 
      
 155 
     | 
    
         
            +
                  elsif @mode == :html
         
     | 
| 
      
 156 
     | 
    
         
            +
                    ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "workid", "link", "bullet"]
         
     | 
| 
      
 157 
     | 
    
         
            +
                  else
         
     | 
| 
      
 158 
     | 
    
         
            +
                    []
         
     | 
| 
      
 159 
     | 
    
         
            +
                  end
         
     | 
| 
      
 160 
     | 
    
         
            +
                end
         
     | 
| 
      
 161 
     | 
    
         
            +
             
     | 
| 
      
 162 
     | 
    
         
            +
                def html_str_2_tokens(str)
         
     | 
| 
      
 163 
     | 
    
         
            +
                  html = Nokogiri::HTML.fragment(str.gsub('>', '> ')) # gsub to ensure strings in separate tags are always separate tokens even if HTML is bad
         
     | 
| 
      
 164 
     | 
    
         
            +
             
     | 
| 
      
 165 
     | 
    
         
            +
                  tokens = []
         
     | 
| 
      
 166 
     | 
    
         
            +
                  html.traverse do |node|
         
     | 
| 
      
 167 
     | 
    
         
            +
                    tokens += html_text_node_2_tokens(node) if node.text?
         
     | 
| 
      
 168 
     | 
    
         
            +
                  end
         
     | 
| 
      
 169 
     | 
    
         
            +
                  tokens
         
     | 
| 
      
 170 
     | 
    
         
            +
                end
         
     | 
| 
      
 171 
     | 
    
         
            +
             
     | 
| 
      
 172 
     | 
    
         
            +
                def html_text_node_2_tokens(node)
         
     | 
| 
      
 173 
     | 
    
         
            +
                  text = CGI.unescapeHTML(node.text)
         
     | 
| 
      
 174 
     | 
    
         
            +
                  return [] if text.blank?
         
     | 
| 
      
 175 
     | 
    
         
            +
             
     | 
| 
      
 176 
     | 
    
         
            +
                  tokens = text_str_2_tokens(text)
         
     | 
| 
      
 177 
     | 
    
         
            +
                  tokens.each_with_index { |tok, i| tok.is_in_node!(node, i, tokens.length) }
         
     | 
| 
      
 178 
     | 
    
         
            +
                  tokens
         
     | 
| 
      
 179 
     | 
    
         
            +
                end
         
     | 
| 
      
 180 
     | 
    
         
            +
             
     | 
| 
      
 181 
     | 
    
         
            +
                def text_str_2_tokens(text)
         
     | 
| 
      
 182 
     | 
    
         
            +
                  tagged = tagger.add_tags(normalize_citation(text))
         
     | 
| 
      
 183 
     | 
    
         
            +
                  tags = tagged_string_2_tags(tagged.gsub('&','&')) # EngTagger has legitimately added angle brackets which are meaningful in XML, but angle-brackets predate EngTagger and are semantic
         
     | 
| 
      
 184 
     | 
    
         
            +
                  tags.map { |tag| Token.new(tag.text, tag.name) }
         
     | 
| 
      
 185 
     | 
    
         
            +
                end
         
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
    # calculate features on the full citation string
    #
    # raw_string      - citation text (tagged XML when training is true)
    # training        - when true, each token must carry a label, and the
    #                   label is appended as the last column of its vector
    # presumed_author - optional author hint passed to the author features
    #
    # Returns [tokens, features]; features holds one array per token laid out
    # as [raw_token, values in @feature_order (, label when training)].
    def str_2_features(raw_string, training=false, presumed_author=nil)
      features = []
      tokens = prepare_token_data(raw_string, training)

      author_names = normalize_input_author(presumed_author)

      tokens.each_with_index do |tok, toki|
        raise "All tokens must be labeled" if training && tok.label.nil?

        feats = {}

        # @token_features is alphabetically sorted: later feature methods may
        # rely on state cached by earlier ones (see class comment).
        @token_features.each {|f|
          feats[f] = self.send(f, tokens, toki, author_names)
        }

        features << [tok.raw]
        @feature_order.each {|f| features.last << feats[f]}
        features.last << tok.label if training
      end

      [tokens, features]
    end
         
     | 
| 
      
 210 
     | 
    
         
            +
             
     | 
| 
      
 211 
     | 
    
         
            +
                def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
         
     | 
| 
      
 212 
     | 
    
         
            +
                  tagged_refs ||= default_tagged_references
         
     | 
| 
      
 213 
     | 
    
         
            +
             
     | 
| 
      
 214 
     | 
    
         
            +
                  fin = File.open(tagged_refs, 'r')
         
     | 
| 
      
 215 
     | 
    
         
            +
                  fout = File.open(training_data, 'w')
         
     | 
| 
      
 216 
     | 
    
         
            +
                  x = 0
         
     | 
| 
      
 217 
     | 
    
         
            +
                  while l = fin.gets
         
     | 
| 
      
 218 
     | 
    
         
            +
                    _, data = str_2_features(l.strip, true)
         
     | 
| 
      
 219 
     | 
    
         
            +
                    data.each {|line| fout.write("#{line.join(" ")}\n") }
         
     | 
| 
      
 220 
     | 
    
         
            +
                    fout.write("\n")
         
     | 
| 
      
 221 
     | 
    
         
            +
                  end
         
     | 
| 
      
 222 
     | 
    
         
            +
             
     | 
| 
      
 223 
     | 
    
         
            +
                  fin.close
         
     | 
| 
      
 224 
     | 
    
         
            +
                  fout.flush
         
     | 
| 
      
 225 
     | 
    
         
            +
                  fout.close
         
     | 
| 
      
 226 
     | 
    
         
            +
                end
         
     | 
| 
      
 227 
     | 
    
         
            +
             
     | 
| 
      
 228 
     | 
    
         
            +
    # Trains a CRF++ model from tagged references, (re)generating the
    # training file unless one is supplied.
    #
    # NOTE(review): shells out to `crf_learn`, which must be on PATH. The
    # paths are interpolated unquoted into the command, so they must not
    # contain shell metacharacters — fine for the bundled defaults, but
    # confirm before passing caller-supplied paths.
    def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
      tagged_refs ||= default_tagged_references
      model ||= default_model_file
      template ||= default_template_file

      if training_data.nil?
        training_data = TRAINING_DATA
        write_training_file(tagged_refs, training_data)
      end

      # -f3: drop features seen fewer than 3 times; stdout is redirected to
      # stderr so training chatter does not pollute the caller's stdout.
      `crf_learn #{template} #{training_data} #{model} -f3 1>&2`
    end
         
     | 
| 
      
 240 
     | 
    
         
            +
             
     | 
| 
      
 241 
     | 
    
         
            +
                def default_tagged_references
         
     | 
| 
      
 242 
     | 
    
         
            +
                  if @mode == :string
         
     | 
| 
      
 243 
     | 
    
         
            +
                    TAGGED_REFERENCES
         
     | 
| 
      
 244 
     | 
    
         
            +
                  elsif @mode == :html
         
     | 
| 
      
 245 
     | 
    
         
            +
                    TAGGED_HTML_REFERENCES
         
     | 
| 
      
 246 
     | 
    
         
            +
                  else
         
     | 
| 
      
 247 
     | 
    
         
            +
                    raise "Unknown mode: #{@mode}"
         
     | 
| 
      
 248 
     | 
    
         
            +
                  end
         
     | 
| 
      
 249 
     | 
    
         
            +
                end
         
     | 
| 
      
 250 
     | 
    
         
            +
             
     | 
| 
      
 251 
     | 
    
         
            +
                def default_model_file
         
     | 
| 
      
 252 
     | 
    
         
            +
                  if @mode == :string
         
     | 
| 
      
 253 
     | 
    
         
            +
                    MODEL_FILE
         
     | 
| 
      
 254 
     | 
    
         
            +
                  elsif @mode == :html
         
     | 
| 
      
 255 
     | 
    
         
            +
                    HTML_MODEL_FILE
         
     | 
| 
      
 256 
     | 
    
         
            +
                  else
         
     | 
| 
      
 257 
     | 
    
         
            +
                    raise "Unknown mode: #{@mode}"
         
     | 
| 
      
 258 
     | 
    
         
            +
                  end
         
     | 
| 
      
 259 
     | 
    
         
            +
                end
         
     | 
| 
      
 260 
     | 
    
         
            +
             
     | 
| 
      
 261 
     | 
    
         
            +
                def default_template_file
         
     | 
| 
      
 262 
     | 
    
         
            +
                  if @mode == :string
         
     | 
| 
      
 263 
     | 
    
         
            +
                    TEMPLATE_FILE
         
     | 
| 
      
 264 
     | 
    
         
            +
                  elsif @mode == :html
         
     | 
| 
      
 265 
     | 
    
         
            +
                    HTML_TEMPLATE_FILE
         
     | 
| 
      
 266 
     | 
    
         
            +
                  else
         
     | 
| 
      
 267 
     | 
    
         
            +
                    raise "Unknown mode: #{@mode}"
         
     | 
| 
      
 268 
     | 
    
         
            +
                  end
         
     | 
| 
      
 269 
     | 
    
         
            +
                end
         
     | 
| 
      
 270 
     | 
    
         
            +
             
     | 
| 
      
 271 
     | 
    
         
            +
              end
         
     | 
| 
      
 272 
     | 
    
         
            +
             
     | 
| 
      
 273 
     | 
    
         
            +
              class TrainingError < Exception; end
         
     | 
| 
      
 274 
     | 
    
         
            +
             
     | 
| 
      
 275 
     | 
    
         
            +
              class Token
         
     | 
| 
      
 276 
     | 
    
         
            +
             
     | 
| 
      
 277 
     | 
    
         
            +
                attr_reader :node, :idx_in_node, :node_token_count, :part_of_speech
         
     | 
| 
      
 278 
     | 
    
         
            +
                attr_accessor :label
         
     | 
| 
      
 279 
     | 
    
         
            +
             
     | 
| 
      
 280 
     | 
    
         
            +
                def initialize(str, part_of_speech=nil)
         
     | 
| 
      
 281 
     | 
    
         
            +
                  @str = str
         
     | 
| 
      
 282 
     | 
    
         
            +
                  @part_of_speech = part_of_speech
         
     | 
| 
      
 283 
     | 
    
         
            +
                end
         
     | 
| 
      
 284 
     | 
    
         
            +
             
     | 
| 
      
 285 
     | 
    
         
            +
                def is_in_node!(node, idx_in_node, node_token_count)
         
     | 
| 
      
 286 
     | 
    
         
            +
                  @node = node
         
     | 
| 
      
 287 
     | 
    
         
            +
                  @idx_in_node = idx_in_node
         
     | 
| 
      
 288 
     | 
    
         
            +
                  @node_token_count = node_token_count
         
     | 
| 
      
 289 
     | 
    
         
            +
                end
         
     | 
| 
      
 290 
     | 
    
         
            +
             
     | 
| 
      
 291 
     | 
    
         
            +
                def raw
         
     | 
| 
      
 292 
     | 
    
         
            +
                  @str
         
     | 
| 
      
 293 
     | 
    
         
            +
                end
         
     | 
| 
      
 294 
     | 
    
         
            +
             
     | 
| 
      
 295 
     | 
    
         
            +
                def np
         
     | 
| 
      
 296 
     | 
    
         
            +
                  @np ||= CRFParser.strip_punct(@str)
         
     | 
| 
      
 297 
     | 
    
         
            +
                end
         
     | 
| 
      
 298 
     | 
    
         
            +
             
     | 
| 
      
 299 
     | 
    
         
            +
                def lcnp
         
     | 
| 
      
 300 
     | 
    
         
            +
                  @lcnp ||= np == "EMPTY" ? np : np.downcase
         
     | 
| 
      
 301 
     | 
    
         
            +
                end
         
     | 
| 
      
 302 
     | 
    
         
            +
             
     | 
| 
      
 303 
     | 
    
         
            +
                def empty?
         
     | 
| 
      
 304 
     | 
    
         
            +
                  raw.strip.blank?
         
     | 
| 
      
 305 
     | 
    
         
            +
                end
         
     | 
| 
      
 306 
     | 
    
         
            +
             
     | 
| 
      
 307 
     | 
    
         
            +
                def to_s
         
     | 
| 
      
 308 
     | 
    
         
            +
                  "{#{raw}}"
         
     | 
| 
      
 309 
     | 
    
         
            +
                end
         
     | 
| 
      
 310 
     | 
    
         
            +
             
     | 
| 
      
 311 
     | 
    
         
            +
                def for_join(prev)
         
     | 
| 
      
 312 
     | 
    
         
            +
                  if ['pp','ppc','ppr','pps','rrb', 'pos'].include?(part_of_speech)
         
     | 
| 
      
 313 
     | 
    
         
            +
                    raw
         
     | 
| 
      
 314 
     | 
    
         
            +
                  elsif prev && ['ppd','ppl','lrb'].include?(prev.part_of_speech)
         
     | 
| 
      
 315 
     | 
    
         
            +
                    raw
         
     | 
| 
      
 316 
     | 
    
         
            +
                  else
         
     | 
| 
      
 317 
     | 
    
         
            +
                    " "+raw
         
     | 
| 
      
 318 
     | 
    
         
            +
                  end
         
     | 
| 
      
 319 
     | 
    
         
            +
                end
         
     | 
| 
      
 320 
     | 
    
         
            +
              end
         
     | 
| 
      
 321 
     | 
    
         
            +
             
     | 
| 
      
 322 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,252 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Excite
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
              module Postprocessor
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
                def normalize_fields(citation_hsh)
         
     | 
| 
      
 8 
     | 
    
         
            +
                  citation_hsh.keys.each {|key| self.send("normalize_#{key}", citation_hsh) }
         
     | 
| 
      
 9 
     | 
    
         
            +
                  citation_hsh
         
     | 
| 
      
 10 
     | 
    
         
            +
                end
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
                def method_missing(m, *args, &block)
         
     | 
| 
      
 13 
     | 
    
         
            +
                  # Call normalize on any fields that don't have their own normalization
         
     | 
| 
      
 14 
     | 
    
         
            +
                  # method defined
         
     | 
| 
      
 15 
     | 
    
         
            +
                  if m.to_s =~ /^normalize/
         
     | 
| 
      
 16 
     | 
    
         
            +
                    m.to_s =~ /normalize_(.*)$/
         
     | 
| 
      
 17 
     | 
    
         
            +
                    normalize($1, *args)
         
     | 
| 
      
 18 
     | 
    
         
            +
                  else super
         
     | 
| 
      
 19 
     | 
    
         
            +
                  end
         
     | 
| 
      
 20 
     | 
    
         
            +
                end
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
                # default normalization function for all fields that do not have their
         
     | 
| 
      
 23 
     | 
    
         
            +
                # own normalization
         
     | 
| 
      
 24 
     | 
    
         
            +
                # Strip any leading and/or trailing punctuation and space
         
     | 
| 
      
 25 
     | 
    
         
            +
                def normalize(key, hsh)
         
     | 
| 
      
 26 
     | 
    
         
            +
                  hsh[key].gsub!(/^[^A-Za-z0-9]+/, '')
         
     | 
| 
      
 27 
     | 
    
         
            +
                  hsh[key].gsub!(/[^A-Za-z0-9]+$/, '')
         
     | 
| 
      
 28 
     | 
    
         
            +
                end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
# Normalizes the 'title' field in place. Cleanup steps, in order:
#   1. strip leading numerals ("1.", "IV)", "A." ...) from the start
#   2. if the real title is quoted inside this string, try to extract it
#   3. if the title has at least 2 words before a newline or period or open
#      parens, strip everything after
#   4. apply the default #normalize punctuation trim
# TODO could do better with knowledge of prepositions, names - maybe we just need a second model?
def normalize_title(hsh)
  str = hsh['title'].strip

  numeral_regexes = [
    /^[0-9]+[.)](\s+|(?=["'”’´‘“`'A-Z]))/i,                                    # initial numbers + punctuation + space or a quote or a capital letter
    /^C{0,3}(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])[.)](\s+|(?=["'”’´‘“`'A-Z]))/i,    # initial roman numerals
    /^[A-Z][.)](\s+|(?=["'”’´‘“`'A-Z]))/i                                      # initial single letter
  ]

  # Only the first numeral style that matches is stripped.
  numeral_regexes.each do |regex|
    if str.gsub!(regex, '')
      break
    end
  end

  # If the title opens with a quote char and at least one pairable quote
  # follows, drop the opening quote and everything after the closing one.
  if (m = str.match /^(["'”’´‘“`'])/)
    quote_char = m[1]
    pairable = pairable_quote_chars(quote_char)

    if str.scan(/[#{pairable}]/).length >= 2
      str.gsub!(/^#{quote_char}/, '')
      str.gsub!(/[#{pairable}][^#{pairable}]+$/, '')
    end
  end

  # Repeatedly truncate at a newline, or a period followed by whitespace or
  # an open paren, as long as at least two words precede the separator.
  # NOTE(review): rindex finds the *last* occurrence of the matched
  # separator text anywhere in str, which may not be the match position.
  while (m = str.match /\S+\s+\S+.*(\n|\.(?=\s|\()).*/)
    i = str.rindex m[1]
    str = str[0..i-1]
  end

  hsh['title'] = str
  normalize('title',hsh)
end
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
                def pairable_quote_chars(quote_char)
         
     | 
| 
      
 69 
     | 
    
         
            +
                  [%{"”“}, %{’'`‘´'}].each do |chars|
         
     | 
| 
      
 70 
     | 
    
         
            +
                    return chars if chars.include? quote_char
         
     | 
| 
      
 71 
     | 
    
         
            +
                  end
         
     | 
| 
      
 72 
     | 
    
         
            +
                end
         
     | 
| 
      
 73 
     | 
    
         
            +
             
     | 
| 
      
 74 
     | 
    
         
            +
##
# Tries to split the author tokens into individual author names
# and then normalizes these names individually.  Stores the resulting
# name list under 'authors' and returns the citation hash.
##
def normalize_author(hsh)
  str = hsh['author']
  tokens = repair_and_tokenize_author_text(str)
  authors = []
  current_auth = []
  begin_auth = 1   # truthy-int flag: 1 while the next token starts a new name
  tokens.each {|tok|
    # "&" or "and" terminates the current author name
    if tok =~ /^(&|and)$/i
      if !current_auth.empty?
        auth = normalize_author_name(current_auth)
        authors << auth
      end
      current_auth = []
      begin_auth = 1
      next
    end
    # the first token of a name is always kept, even if it ends in a comma
    # (it may be a "Last, First" surname)
    if begin_auth > 0
      current_auth << tok
      begin_auth = 0
      next
    end
    # a trailing comma on any later token ends the current author name
    if tok =~ /,$/
      current_auth << tok
      if !current_auth.empty?
        auth = normalize_author_name(current_auth)
        authors << auth
        current_auth = []
        begin_auth = 1
      end
    else
      current_auth << tok
    end
  }
  # flush the final name, discarding placeholder ("-") or blank results
  if !current_auth.empty?
    auth = normalize_author_name(current_auth)
    authors << auth.strip unless auth.strip == "-" || auth.strip.blank?
  end
  hsh['authors'] = authors if !authors.empty?
  normalize('author',hsh)
  hsh
end
         
     | 
| 
      
 120 
     | 
    
         
            +
             
     | 
| 
      
 121 
     | 
    
         
            +
                def normalize_date(hsh)
         
     | 
| 
      
 122 
     | 
    
         
            +
                  str = hsh['date']
         
     | 
| 
      
 123 
     | 
    
         
            +
                  if str =~ /(\d{4})/
         
     | 
| 
      
 124 
     | 
    
         
            +
                    year = $1.to_i
         
     | 
| 
      
 125 
     | 
    
         
            +
                    current_year = Time.now.year
         
     | 
| 
      
 126 
     | 
    
         
            +
                    if year <= current_year+3
         
     | 
| 
      
 127 
     | 
    
         
            +
                      ret = year
         
     | 
| 
      
 128 
     | 
    
         
            +
                      hsh['year'] = ret
         
     | 
| 
      
 129 
     | 
    
         
            +
                    else
         
     | 
| 
      
 130 
     | 
    
         
            +
                      ret = nil
         
     | 
| 
      
 131 
     | 
    
         
            +
                    end
         
     | 
| 
      
 132 
     | 
    
         
            +
                  end
         
     | 
| 
      
 133 
     | 
    
         
            +
                  hsh['date'] = ret
         
     | 
| 
      
 134 
     | 
    
         
            +
                  hsh
         
     | 
| 
      
 135 
     | 
    
         
            +
                end
         
     | 
| 
      
 136 
     | 
    
         
            +
             
     | 
| 
      
 137 
     | 
    
         
            +
                def normalize_volume(hsh)
         
     | 
| 
      
 138 
     | 
    
         
            +
                  # If there are two numbers, they are volume and number.
         
     | 
| 
      
 139 
     | 
    
         
            +
                  # e.g. "23(2)", "Vol. 23, No. 3" etc...
         
     | 
| 
      
 140 
     | 
    
         
            +
                  if hsh['volume'] =~ /\D*(\d+)\D+(\d+)/i
         
     | 
| 
      
 141 
     | 
    
         
            +
                    hsh['volume'] = $1
         
     | 
| 
      
 142 
     | 
    
         
            +
                    hsh['number'] = $2
         
     | 
| 
      
 143 
     | 
    
         
            +
                  # Otherwise, just pull out a number and hope that it's the volume
         
     | 
| 
      
 144 
     | 
    
         
            +
                  elsif hsh['volume'] =~ /(\d+)/
         
     | 
| 
      
 145 
     | 
    
         
            +
                    hsh['volume'] = $1
         
     | 
| 
      
 146 
     | 
    
         
            +
                  end
         
     | 
| 
      
 147 
     | 
    
         
            +
                  hsh
         
     | 
| 
      
 148 
     | 
    
         
            +
                end
         
     | 
| 
      
 149 
     | 
    
         
            +
             
     | 
| 
      
 150 
     | 
    
         
            +
                ##
         
     | 
| 
      
 151 
     | 
    
         
            +
                # Normalizes page fields into the form "start--end".  If the page
         
     | 
| 
      
 152 
     | 
    
         
            +
                # field does not appear to be in a standard form, does nothing.
         
     | 
| 
      
 153 
     | 
    
         
            +
                ##
         
     | 
| 
      
 154 
     | 
    
         
            +
                def normalize_pages(hsh)
         
     | 
| 
      
 155 
     | 
    
         
            +
                  # "vol.issue (year):pp"
         
     | 
| 
      
 156 
     | 
    
         
            +
                  case hsh['pages']
         
     | 
| 
      
 157 
     | 
    
         
            +
                  when /(\d+) (?: \.(\d+))? (?: \( (\d\d\d\d) \))? : (\d.*)/x
         
     | 
| 
      
 158 
     | 
    
         
            +
                    hsh['volume'] = $1
         
     | 
| 
      
 159 
     | 
    
         
            +
                    hsh['number'] = $2 if $2
         
     | 
| 
      
 160 
     | 
    
         
            +
                    hsh['year'] = $3 if $3
         
     | 
| 
      
 161 
     | 
    
         
            +
                    hsh['pages'] = $4
         
     | 
| 
      
 162 
     | 
    
         
            +
                  end
         
     | 
| 
      
 163 
     | 
    
         
            +
             
     | 
| 
      
 164 
     | 
    
         
            +
                  case hsh['pages']
         
     | 
| 
      
 165 
     | 
    
         
            +
                  when  /(\d+)[^\d]+(\d+)/
         
     | 
| 
      
 166 
     | 
    
         
            +
                    hsh['pages'] = "#{$1}--#{$2}"
         
     | 
| 
      
 167 
     | 
    
         
            +
                  when  /(\d+)/
         
     | 
| 
      
 168 
     | 
    
         
            +
                    hsh['pages'] = $1
         
     | 
| 
      
 169 
     | 
    
         
            +
                  end
         
     | 
| 
      
 170 
     | 
    
         
            +
                  hsh
         
     | 
| 
      
 171 
     | 
    
         
            +
                end
         
     | 
| 
      
 172 
     | 
    
         
            +
             
     | 
| 
      
 173 
     | 
    
         
            +
# Cleans up an author string (mutating it in place) and splits it into
# name tokens.
#
# The cleanup drops "et al." and everything after it, removes
# parenthesized/bracketed spans (including unbalanced ones), converts ';'
# to ',' and pads commas with a space, strips special characters, and
# joins name particles with underscores (see #join_multi_word_names).
#
# Tokenizing then walks the whitespace-split tokens: a token with no
# letters in the first half of the list resets the accumulator (assumed
# leading junk); in the second half it ends the name list. Suffix tokens
# ("Jr", "PhD", ... after a comma) and roman numerals are skipped.
#
# Returns the token array.
def repair_and_tokenize_author_text(author_text)
  # Repair obvious parse errors and weird notations.
  author_text.sub!(/et\.? al\.?.*$/, '')
  # FIXME: maybe I'm mis-understanding Perl regular expressions, but
  # this pattern from ParseCit appears to do the Wrong Thing:
  # author_text.sub!(/^.*?[a-zA-Z][a-zA-Z]+\. /, '')
  author_text.gsub!(/\(.*?\)/, '')       # balanced parens
  author_text.gsub!(/^.*?\)\.?/, '')     # stray closing paren: drop prefix
  author_text.gsub!(/\(.*?$/, '')        # stray opening paren: drop suffix
  author_text.gsub!(/\[.*?\]/, '')       # balanced brackets
  author_text.gsub!(/^.*?\]\.?/, '')     # stray closing bracket: drop prefix
  author_text.gsub!(/\[.*?$/, '')        # stray opening bracket: drop suffix
  author_text.gsub!(/;/, ',')
  author_text.gsub!(/,/, ', ')
  author_text.gsub!(/\:/, ' ')
  author_text.gsub!(/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]/, '')
  author_text = join_multi_word_names(author_text)

  orig_tokens = author_text.split(/\s+/)
  tokens = []
  last = false
  orig_tokens.each_with_index {|tok, i|
    # A token without any letter or '&' is junk: reset if early, stop after.
    if tok !~ /[A-Za-z&]/
      if i < orig_tokens.length/2
        tokens = []
        next
      else
        last = true
      end
    end
    # Skip name suffixes that follow a comma, and roman numerals.
    if (tok =~ /^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i and
        tokens.last =~ /\,$/) or
        tok =~ /^[IVX][IVX]+\.?\,?$/

      next
    end
    tokens << tok
    break if last
  }
  tokens
end # repair_and_tokenize_author_text
         
     | 
| 
      
 214 
     | 
    
         
            +
             
     | 
| 
      
 215 
     | 
    
         
            +
                # Insert underscores to join name particles. i.e.
         
     | 
| 
      
 216 
     | 
    
         
            +
                # Jon de Groote ---> Jon de_Groote
         
     | 
| 
      
 217 
     | 
    
         
            +
                def join_multi_word_names(author_text)
         
     | 
| 
      
 218 
     | 
    
         
            +
                  author_text.gsub(/\b((?:van|von|der|den|de|di|le|el))\s/i) {
         
     | 
| 
      
 219 
     | 
    
         
            +
                    "#{$1}_"
         
     | 
| 
      
 220 
     | 
    
         
            +
                  }
         
     | 
| 
      
 221 
     | 
    
         
            +
                end
         
     | 
| 
      
 222 
     | 
    
         
            +
             
     | 
| 
      
 223 
     | 
    
         
            +
                ##
         
     | 
| 
      
 224 
     | 
    
         
            +
                # Tries to normalize an individual author name into the form
         
     | 
| 
      
 225 
     | 
    
         
            +
                # "First Middle Last", without punctuation.
         
     | 
| 
      
 226 
     | 
    
         
            +
                ##
         
     | 
| 
      
 227 
     | 
    
         
            +
                def normalize_author_name(auth_toks)
         
     | 
| 
      
 228 
     | 
    
         
            +
                  return '' if auth_toks.empty?
         
     | 
| 
      
 229 
     | 
    
         
            +
                  str = auth_toks.join(" ")
         
     | 
| 
      
 230 
     | 
    
         
            +
                  if str =~ /(.+),\s*(.+)/
         
     | 
| 
      
 231 
     | 
    
         
            +
                    str = "#{$1} #{$2}"
         
     | 
| 
      
 232 
     | 
    
         
            +
                  end
         
     | 
| 
      
 233 
     | 
    
         
            +
                  str.gsub!(/\.\-/, '-')
         
     | 
| 
      
 234 
     | 
    
         
            +
                  str.gsub!(/[\,\.]/, ' ')
         
     | 
| 
      
 235 
     | 
    
         
            +
                  str.gsub!(/  +/, ' ')
         
     | 
| 
      
 236 
     | 
    
         
            +
                  str.strip!
         
     | 
| 
      
 237 
     | 
    
         
            +
             
     | 
| 
      
 238 
     | 
    
         
            +
                  if (str =~ /^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/)
         
     | 
| 
      
 239 
     | 
    
         
            +
                    new_toks = str.split(/\s+/)
         
     | 
| 
      
 240 
     | 
    
         
            +
                    new_order = new_toks[1...new_toks.length];
         
     | 
| 
      
 241 
     | 
    
         
            +
                    new_order << new_toks[0]
         
     | 
| 
      
 242 
     | 
    
         
            +
                    str = new_order.join(" ")
         
     | 
| 
      
 243 
     | 
    
         
            +
                  end
         
     | 
| 
      
 244 
     | 
    
         
            +
             
     | 
| 
      
 245 
     | 
    
         
            +
                  str.gsub!(/^[^A-Za-z0-9]+/, '')
         
     | 
| 
      
 246 
     | 
    
         
            +
                  str.gsub!(/[^A-Za-z0-9]+$/, '')
         
     | 
| 
      
 247 
     | 
    
         
            +
                  return str
         
     | 
| 
      
 248 
     | 
    
         
            +
                end
         
     | 
| 
      
 249 
     | 
    
         
            +
             
     | 
| 
      
 250 
     | 
    
         
            +
              end
         
     | 
| 
      
 251 
     | 
    
         
            +
             
     | 
| 
      
 252 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,107 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Excite
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
              module Preprocessor
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
                MARKER_TYPES = {
         
     | 
| 
      
 8 
     | 
    
         
            +
                  :SQUARE       => '\\[.+?\\]',
         
     | 
| 
      
 9 
     | 
    
         
            +
                  :PAREN        => '\\(.+?\\)',
         
     | 
| 
      
 10 
     | 
    
         
            +
                  :NAKEDNUM     => '\\d+',
         
     | 
| 
      
 11 
     | 
    
         
            +
                  :NAKEDNUMDOT  => '\\d+\\.',
         
     | 
| 
      
 12 
     | 
    
         
            +
                }
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                CLEANUP_RULES_FILE = "#{File.dirname(__FILE__)}/../../config/citation_cleanup_rules.yml"
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                def cleanup_rules
         
     | 
| 
      
 17 
     | 
    
         
            +
                  return @rules if @rules
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
                  raw = YAML.load_file CLEANUP_RULES_FILE
         
     | 
| 
      
 20 
     | 
    
         
            +
                  @rules = raw['order'].map do |rule_name|
         
     | 
| 
      
 21 
     | 
    
         
            +
                    re = Regexp.new(raw['rules'][rule_name]['regex'], raw['rules'][rule_name]['ignore_case'])
         
     | 
| 
      
 22 
     | 
    
         
            +
                    repl = raw['rules'][rule_name]['replacement_str'] || ''
         
     | 
| 
      
 23 
     | 
    
         
            +
                    { re: re, repl: repl }
         
     | 
| 
      
 24 
     | 
    
         
            +
                  end
         
     | 
| 
      
 25 
     | 
    
         
            +
                end
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
                ##
         
     | 
| 
      
 28 
     | 
    
         
            +
                # Removes lines that appear to be junk from the citation text,
         
     | 
| 
      
 29 
     | 
    
         
            +
                # and applies cleanup regexes from the configuration file.
         
     | 
| 
      
 30 
     | 
    
         
            +
                ##
         
     | 
| 
      
 31 
     | 
    
         
            +
                def normalize_cite_text(cite_text)
         
     | 
| 
      
 32 
     | 
    
         
            +
                  cite_text.split(/\n/).reject do |line|
         
     | 
| 
      
 33 
     | 
    
         
            +
                    line.blank? || line =~ /^[\s\d]*$/
         
     | 
| 
      
 34 
     | 
    
         
            +
                  end.map do |line|
         
     | 
| 
      
 35 
     | 
    
         
            +
                    normalize_citation(line)
         
     | 
| 
      
 36 
     | 
    
         
            +
                  end.join("\n")
         
     | 
| 
      
 37 
     | 
    
         
            +
                end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                def normalize_citation(cite)
         
     | 
| 
      
 40 
     | 
    
         
            +
                  cite = cite.dup
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
                  cleanup_rules.each do |rule|
         
     | 
| 
      
 43 
     | 
    
         
            +
                    cite.gsub!(rule[:re], rule[:repl])
         
     | 
| 
      
 44 
     | 
    
         
            +
                  end
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
                  cite
         
     | 
| 
      
 47 
     | 
    
         
            +
                end
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
                ##
         
     | 
| 
      
 50 
     | 
    
         
            +
                # Controls the process by which citations are segmented,
         
     | 
| 
      
 51 
     | 
    
         
            +
                # based on the result of trying to guess the type of
         
     | 
| 
      
 52 
     | 
    
         
            +
                # citation marker used in the reference section.  Returns
         
     | 
| 
      
 53 
     | 
    
         
            +
                # a reference to a list of citation objects.
         
     | 
| 
      
 54 
     | 
    
         
            +
                ##
         
     | 
| 
      
 55 
     | 
    
         
            +
                def segment_citations(cite_text)
         
     | 
| 
      
 56 
     | 
    
         
            +
                  marker_type = guess_marker_type(cite_text)
         
     | 
| 
      
 57 
     | 
    
         
            +
                  unless marker_type == 'UNKNOWN'
         
     | 
| 
      
 58 
     | 
    
         
            +
                    citations = split_unmarked_citations(cite_text)
         
     | 
| 
      
 59 
     | 
    
         
            +
                  else
         
     | 
| 
      
 60 
     | 
    
         
            +
                    citations = split_citations_by_marker(cite_text, marker_type)
         
     | 
| 
      
 61 
     | 
    
         
            +
                  end
         
     | 
| 
      
 62 
     | 
    
         
            +
                  return citations
         
     | 
| 
      
 63 
     | 
    
         
            +
                end
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
                ##
         
     | 
| 
      
 66 
     | 
    
         
            +
                # Segments citations that have explicit markers in the
         
     | 
| 
      
 67 
     | 
    
         
            +
                # reference section.  Whenever a new line starts with an
         
     | 
| 
      
 68 
     | 
    
         
            +
                # expression that matches what we'd expect of a marker,
         
     | 
| 
      
 69 
     | 
    
         
            +
                # a new citation is started.  Returns a reference to a
         
     | 
| 
      
 70 
     | 
    
         
            +
                # list of citation objects.
         
     | 
| 
      
 71 
     | 
    
         
            +
                ##
         
     | 
| 
      
 72 
     | 
    
         
            +
                def split_citations_by_marker(cite_text, marker_type=nil)
         
     | 
| 
      
 73 
     | 
    
         
            +
                  citations = []
         
     | 
| 
      
 74 
     | 
    
         
            +
                  current_citation = Citation.new
         
     | 
| 
      
 75 
     | 
    
         
            +
                  current_citation_string = nil
         
     | 
| 
      
 76 
     | 
    
         
            +
             
     | 
| 
      
 77 
     | 
    
         
            +
                  cite_text.split(/\n/).each {|line|
         
     | 
| 
      
 78 
     | 
    
         
            +
                    if line =~ /^\s*(#{MARKER_TYPES{marker_type}})\s*(.*)$/
         
     | 
| 
      
 79 
     | 
    
         
            +
                      marker, cite_string = $1, $2
         
     | 
| 
      
 80 
     | 
    
         
            +
                      if current_citation_string
         
     | 
| 
      
 81 
     | 
    
         
            +
                        current_citation.citation_string = current_citation_string
         
     | 
| 
      
 82 
     | 
    
         
            +
                        citations << current_citation
         
     | 
| 
      
 83 
     | 
    
         
            +
                        current_citation_string = nil
         
     | 
| 
      
 84 
     | 
    
         
            +
                      end
         
     | 
| 
      
 85 
     | 
    
         
            +
                      current_citation = Citation.new
         
     | 
| 
      
 86 
     | 
    
         
            +
                      current_citation.marker_type = marker_type
         
     | 
| 
      
 87 
     | 
    
         
            +
                      current_citation.marker = marker
         
     | 
| 
      
 88 
     | 
    
         
            +
                      current_citation_string = cite_string
         
     | 
| 
      
 89 
     | 
    
         
            +
                    else
         
     | 
| 
      
 90 
     | 
    
         
            +
                      if current_citation_string =~ /\s\-$/
         
     | 
| 
      
 91 
     | 
    
         
            +
                        current_citation_string.sub(/\-$/, '')
         
     | 
| 
      
 92 
     | 
    
         
            +
                        current_citation_string << line
         
     | 
| 
      
 93 
     | 
    
         
            +
                      else
         
     | 
| 
      
 94 
     | 
    
         
            +
                        current_citation_string << " " << line
         
     | 
| 
      
 95 
     | 
    
         
            +
                      end
         
     | 
| 
      
 96 
     | 
    
         
            +
                    end
         
     | 
| 
      
 97 
     | 
    
         
            +
                  }
         
     | 
| 
      
 98 
     | 
    
         
            +
             
     | 
| 
      
 99 
     | 
    
         
            +
                  if current_citation && current_citation_string
         
     | 
| 
      
 100 
     | 
    
         
            +
                    current_citation.string = current_citation_string
         
     | 
| 
      
 101 
     | 
    
         
            +
                    citations << current_citation
         
     | 
| 
      
 102 
     | 
    
         
            +
                  end
         
     | 
| 
      
 103 
     | 
    
         
            +
                  citations
         
     | 
| 
      
 104 
     | 
    
         
            +
                end
         
     | 
| 
      
 105 
     | 
    
         
            +
             
     | 
| 
      
 106 
     | 
    
         
            +
              end
         
     | 
| 
      
 107 
     | 
    
         
            +
            end
         
     |