excite 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
data/lib/excite/crfparser.rb
@@ -0,0 +1,322 @@
+ # encoding: UTF-8
+
+ require 'tempfile'
+ require 'nokogiri'
+ require 'cgi'
+ require 'engtagger'
+
+ module Excite
+
+   class CRFParser
+
+     attr_reader :feature_order
+     attr_reader :token_features
+
+     include TokenFeatures
+     include Preprocessor
+     include Postprocessor
+
+     DIR = File.dirname(__FILE__)
+     TAGGED_REFERENCES = "#{DIR}/resources/trainingdata/tagged_references.txt"
+     TAGGED_HTML_REFERENCES = "#{DIR}/resources/trainingdata/tagged_html_references.txt"
+     TRAINING_DATA = "#{DIR}/resources/trainingdata/training_data.txt"
+     MODEL_FILE = "#{DIR}/resources/model"
+     HTML_MODEL_FILE = "#{DIR}/resources/html_model"
+     TEMPLATE_FILE = "#{DIR}/resources/parsCit.template"
+     HTML_TEMPLATE_FILE = "#{DIR}/resources/html.template"
+     CONFIG_FILE = "#{DIR}/../../config/parscit_features.yml"
+
+     # Feature functions must be performed in alphabetical order, since
+     # later functions may depend on earlier ones.
+     # TODO This seems pretty confusing and dependent on the current features.
+     def initialize(mode=:string)
+       @mode = mode
+
+       f = File.open(CONFIG_FILE, 'r')
+       hsh = YAML::load(f)[mode.to_s]
+       @feature_order = hsh["feature_order"].map(&:to_sym)
+       @token_features = hsh["feature_order"].sort.map(&:to_sym)
+     end
+
+     def model
+       @model ||= CRFPP::Tagger.new("-m #{default_model_file} -v 1")
+     end
+
+     def parse(str, presumed_author=nil)
+       raw_string = str.dup
+
+       toks, features = str_2_features(str, false, presumed_author)
+       tags, overall_prob, tag_probs = eval_crfpp(features, model)
+
+       ret = {}
+       tags.each_with_index { |t, i| (ret[t] ||= []) << toks[i].for_join(i > 0 ? toks[i-1] : nil) } # guard: toks[i-1] would wrap to the last token when i == 0
+       ret.each { |k, v| ret[k] = v.join('').strip }
+
+       normalize_fields(ret)
+       ret['raw_string'] = raw_string
+       [ret, overall_prob, tag_probs]
+     end
+
+     def eval_crfpp(feat_seq, model)
+       model.clear
+       feat_seq.each {|vec|
+         line = vec.join(" ").strip
+         raise unless model.add(line)
+       }
+       raise unless model.parse
+       tags = []
+       probs = {}
+       feat_seq.length.times {|i|
+         tags << model.y2(i)
+         probs[model.y2(i)] ||= 1
+         probs[model.y2(i)] *= model.prob(i)
+       }
+       [tags, model.prob, probs]
+     end
+
+     def self.strip_punct(str)
+       toknp = str.gsub(/[^\w]/, '')
+       toknp = "EMPTY" if toknp.blank? # TODO Seems maybe hacky
+       toknp
+     end
+
+     def normalize_input_author(str)
+       return nil if str.blank?
+       str.split.map(&:downcase).map{ |t| self.class.strip_punct(t) }.select{ |s| s.length > 2 }
+     end
+
+     def prepare_token_data(raw_string, training=false)
+       if training
+         tags = tagged_string_2_tags(raw_string.strip)
+
+         labels, raw_string, joined_tokens = [], '', ''
+         tags.each do |tag|
+           raw = CGI.unescapeHTML(tag.inner_html)
+
+           label = tag.name
+           raise "Invalid label #{label} for:\n#{raw}" if label.present? && !recognized_labels.include?(label)
+
+           toks = str_2_tokens(raw)
+
+           labels << [label, joined_tokens.length]
+           joined_tokens += toks.map(&:raw).join
+           raw_string += "\n#{raw}"
+         end
+       end
+
+       tokens = str_2_tokens(raw_string.strip)
+
+       if training
+         joined_tokens = ''
+         label, _ = labels.shift
+         next_label, end_idx = labels.shift unless labels.empty?
+
+         tokens.each do |tok|
+           tok.label = label
+           joined_tokens += tok.raw
+           if joined_tokens.length == end_idx
+             label = next_label
+             next_label, end_idx = labels.shift unless labels.empty?
+           elsif end_idx && joined_tokens.length > end_idx && !labels.empty? # nil-guard: end_idx is nil once labels are exhausted
+             raise "Tokens do not match labels"
+           end
+         end
+         raise "Unused label" unless labels.empty?
+       end
+
+       self.clear
+
+       return tokens
+     end
+
+     def tagger
+       @tagger ||= EngTagger.new
+     end
+
+     def tagged_string_2_tags(str)
+       str = "<string>#{str}</string>"
+       node = Nokogiri::XML.fragment(str).css('string')
+       node.children.reject(&:text?)
+     end
+
+     def str_2_tokens(str)
+       if @mode == :html
+         toks = html_str_2_tokens(str)
+       elsif @mode == :string
+         toks = text_str_2_tokens(str)
+       end
+
+       toks.reject { |t| t.empty? }
+     end
+
+     def recognized_labels
+       if @mode == :string
+         ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "tech"]
+       elsif @mode == :html
+         ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "workid", "link", "bullet"]
+       else
+         []
+       end
+     end
+
+     def html_str_2_tokens(str)
+       html = Nokogiri::HTML.fragment(str.gsub('>', '> ')) # gsub to ensure strings in separate tags are always separate tokens even if HTML is bad
+
+       tokens = []
+       html.traverse do |node|
+         tokens += html_text_node_2_tokens(node) if node.text?
+       end
+       tokens
+     end
+
+     def html_text_node_2_tokens(node)
+       text = CGI.unescapeHTML(node.text)
+       return [] if text.blank?
+
+       tokens = text_str_2_tokens(text)
+       tokens.each_with_index { |tok, i| tok.is_in_node!(node, i, tokens.length) }
+       tokens
+     end
+
+     def text_str_2_tokens(text)
+       tagged = tagger.add_tags(normalize_citation(text))
+       tags = tagged_string_2_tags(tagged.gsub('&','&amp;')) # escape bare ampersands so the fragment parses: EngTagger's added angle-bracket tags are meaningful XML, but any '&' predates EngTagger
+       tags.map { |tag| Token.new(tag.text, tag.name) }
+     end
+
+     # calculate features on the full citation string
+     def str_2_features(raw_string, training=false, presumed_author=nil)
+       features = []
+       tokens = prepare_token_data(raw_string, training)
+
+       author_names = normalize_input_author(presumed_author)
+
+       tokens.each_with_index do |tok, toki|
+         raise "All tokens must be labeled" if training && tok.label.nil?
+
+         feats = {}
+
+         @token_features.each {|f|
+           feats[f] = self.send(f, tokens, toki, author_names)
+         }
+
+         features << [tok.raw]
+         @feature_order.each {|f| features.last << feats[f]}
+         features.last << tok.label if training
+       end
+
+       [tokens, features]
+     end
+
+     def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
+       tagged_refs ||= default_tagged_references
+
+       fin = File.open(tagged_refs, 'r')
+       fout = File.open(training_data, 'w')
+       x = 0
+       while l = fin.gets
+         _, data = str_2_features(l.strip, true)
+         data.each {|line| fout.write("#{line.join(" ")}\n") }
+         fout.write("\n")
+       end
+
+       fin.close
+       fout.flush
+       fout.close
+     end
+
+     def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
+       tagged_refs ||= default_tagged_references
+       model ||= default_model_file
+       template ||= default_template_file
+
+       if training_data.nil?
+         training_data = TRAINING_DATA
+         write_training_file(tagged_refs, training_data)
+       end
+
+       `crf_learn #{template} #{training_data} #{model} -f3 1>&2`
+     end
+
+     def default_tagged_references
+       if @mode == :string
+         TAGGED_REFERENCES
+       elsif @mode == :html
+         TAGGED_HTML_REFERENCES
+       else
+         raise "Unknown mode: #{@mode}"
+       end
+     end
+
+     def default_model_file
+       if @mode == :string
+         MODEL_FILE
+       elsif @mode == :html
+         HTML_MODEL_FILE
+       else
+         raise "Unknown mode: #{@mode}"
+       end
+     end
+
+     def default_template_file
+       if @mode == :string
+         TEMPLATE_FILE
+       elsif @mode == :html
+         HTML_TEMPLATE_FILE
+       else
+         raise "Unknown mode: #{@mode}"
+       end
+     end
+
+   end
+
+   class TrainingError < Exception; end
+
+   class Token
+
+     attr_reader :node, :idx_in_node, :node_token_count, :part_of_speech
+     attr_accessor :label
+
+     def initialize(str, part_of_speech=nil)
+       @str = str
+       @part_of_speech = part_of_speech
+     end
+
+     def is_in_node!(node, idx_in_node, node_token_count)
+       @node = node
+       @idx_in_node = idx_in_node
+       @node_token_count = node_token_count
+     end
+
+     def raw
+       @str
+     end
+
+     def np
+       @np ||= CRFParser.strip_punct(@str)
+     end
+
+     def lcnp
+       @lcnp ||= np == "EMPTY" ? np : np.downcase
+     end
+
+     def empty?
+       raw.strip.blank?
+     end
+
+     def to_s
+       "{#{raw}}"
+     end
+
+     def for_join(prev)
+       if ['pp','ppc','ppr','pps','rrb','pos'].include?(part_of_speech)
+         raw
+       elsif prev && ['ppd','ppl','lrb'].include?(prev.part_of_speech)
+         raw
+       else
+         " "+raw
+       end
+     end
+   end
+
+ end
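
For orientation, a minimal usage sketch of the class above (assuming the crfpp bindings and the bundled model files load correctly; the citation string and field values here are made up for illustration):

    require 'excite'

    parser = Excite::CRFParser.new(:string)  # or :html for tagged-HTML input
    fields, prob, tag_probs = parser.parse("Doe, J. An Example Title. Example Journal. 2001.")

    fields['title']       # tokens tagged 'title', joined and normalized
    fields['raw_string']  # the original input, preserved by #parse
    prob                  # overall sequence probability from the CRF model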
data/lib/excite/postprocessor.rb
@@ -0,0 +1,252 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   module Postprocessor
+
+     def normalize_fields(citation_hsh)
+       citation_hsh.keys.each {|key| self.send("normalize_#{key}", citation_hsh) }
+       citation_hsh
+     end
+
+     def method_missing(m, *args, &block)
+       # Call normalize on any fields that don't have their own normalization
+       # method defined
+       if m.to_s =~ /^normalize_(.*)$/
+         field = $1
+         normalize(field, *args)
+       else super
+       end
+     end
+
+     # default normalization function for all fields that do not have their
+     # own normalization
+     # Strip any leading and/or trailing punctuation and space
+     def normalize(key, hsh)
+       hsh[key].gsub!(/^[^A-Za-z0-9]+/, '')
+       hsh[key].gsub!(/[^A-Za-z0-9]+$/, '')
+     end
+
+     # strip leading numerals
+     # if the real title is quoted inside this string, try to extract it
+     # if the title has at least 2 words before a newline or period or open parens, strip everything after
+     # TODO could do better with knowledge of prepositions, names - maybe we just need a second model?
+     def normalize_title(hsh)
+       str = hsh['title'].strip
+
+       numeral_regexes = [
+         /^[0-9]+[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial numbers + punctuation + space or a quote or a capital letter
+         /^C{0,3}(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial roman numerals
+         /^[A-Z][.)](\s+|(?=["'”’´‘“`'A-Z]))/i # initial single letter
+       ]
+
+       numeral_regexes.each do |regex|
+         if str.gsub!(regex, '')
+           break
+         end
+       end
+
+       if (m = str.match /^(["'”’´‘“`'])/)
+         quote_char = m[1]
+         pairable = pairable_quote_chars(quote_char)
+
+         if str.scan(/[#{pairable}]/).length >= 2
+           str.gsub!(/^#{quote_char}/, '')
+           str.gsub!(/[#{pairable}][^#{pairable}]+$/, '')
+         end
+       end
+
+       while (m = str.match /\S+\s+\S+.*(\n|\.(?=\s|\()).*/)
+         i = str.rindex m[1]
+         str = str[0..i-1]
+       end
+
+       hsh['title'] = str
+       normalize('title',hsh)
+     end
+
+     def pairable_quote_chars(quote_char)
+       [%{"”“}, %{’'`‘´'}].each do |chars|
+         return chars if chars.include? quote_char
+       end
+     end
+
+     ##
+     # Tries to split the author tokens into individual author names
+     # and then normalizes these names individually. Returns a
+     # list of author names.
+     ##
+     def normalize_author(hsh)
+       str = hsh['author']
+       tokens = repair_and_tokenize_author_text(str)
+       authors = []
+       current_auth = []
+       begin_auth = 1
+       tokens.each {|tok|
+         if tok =~ /^(&|and)$/i
+           if !current_auth.empty?
+             auth = normalize_author_name(current_auth)
+             authors << auth
+           end
+           current_auth = []
+           begin_auth = 1
+           next
+         end
+         if begin_auth > 0
+           current_auth << tok
+           begin_auth = 0
+           next
+         end
+         if tok =~ /,$/
+           current_auth << tok
+           if !current_auth.empty?
+             auth = normalize_author_name(current_auth)
+             authors << auth
+             current_auth = []
+             begin_auth = 1
+           end
+         else
+           current_auth << tok
+         end
+       }
+       if !current_auth.empty?
+         auth = normalize_author_name(current_auth)
+         authors << auth.strip unless auth.strip == "-" || auth.strip.blank?
+       end
+       hsh['authors'] = authors if !authors.empty?
+       normalize('author',hsh)
+       hsh
+     end
+
+     def normalize_date(hsh)
+       str = hsh['date']
+       if str =~ /(\d{4})/
+         year = $1.to_i
+         current_year = Time.now.year
+         if year <= current_year+3
+           ret = year
+           hsh['year'] = ret
+         else
+           ret = nil
+         end
+       end
+       hsh['date'] = ret
+       hsh
+     end
+
+     def normalize_volume(hsh)
+       # If there are two numbers, they are volume and number.
+       # e.g. "23(2)", "Vol. 23, No. 3" etc...
+       if hsh['volume'] =~ /\D*(\d+)\D+(\d+)/i
+         hsh['volume'] = $1
+         hsh['number'] = $2
+       # Otherwise, just pull out a number and hope that it's the volume
+       elsif hsh['volume'] =~ /(\d+)/
+         hsh['volume'] = $1
+       end
+       hsh
+     end
+
+     ##
+     # Normalizes page fields into the form "start--end". If the page
+     # field does not appear to be in a standard form, does nothing.
+     ##
+     def normalize_pages(hsh)
+       # "vol.issue (year):pp"
+       case hsh['pages']
+       when /(\d+) (?: \.(\d+))? (?: \( (\d\d\d\d) \))? : (\d.*)/x
+         hsh['volume'] = $1
+         hsh['number'] = $2 if $2
+         hsh['year'] = $3 if $3
+         hsh['pages'] = $4
+       end
+
+       case hsh['pages']
+       when /(\d+)[^\d]+(\d+)/
+         hsh['pages'] = "#{$1}--#{$2}"
+       when /(\d+)/
+         hsh['pages'] = $1
+       end
+       hsh
+     end
+
+     def repair_and_tokenize_author_text(author_text)
+       # Repair obvious parse errors and weird notations.
+       author_text.sub!(/et\.? al\.?.*$/, '')
+       # FIXME: maybe I'm mis-understanding Perl regular expressions, but
+       # this pattern from ParsCit appears to do the Wrong Thing:
+       # author_text.sub!(/^.*?[a-zA-Z][a-zA-Z]+\. /, '')
+       author_text.gsub!(/\(.*?\)/, '')
+       author_text.gsub!(/^.*?\)\.?/, '')
+       author_text.gsub!(/\(.*?$/, '')
+       author_text.gsub!(/\[.*?\]/, '')
+       author_text.gsub!(/^.*?\]\.?/, '')
+       author_text.gsub!(/\[.*?$/, '')
+       author_text.gsub!(/;/, ',')
+       author_text.gsub!(/,/, ', ')
+       author_text.gsub!(/\:/, ' ')
+       author_text.gsub!(/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]/, '')
+       author_text = join_multi_word_names(author_text)
+
+       orig_tokens = author_text.split(/\s+/)
+       tokens = []
+       last = false
+       orig_tokens.each_with_index {|tok, i|
+         if tok !~ /[A-Za-z&]/
+           if i < orig_tokens.length/2
+             tokens = []
+             next
+           else
+             last = true
+           end
+         end
+         if (tok =~ /^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i and
+             tokens.last =~ /\,$/) or
+             tok =~ /^[IVX][IVX]+\.?\,?$/
+
+           next
+         end
+         tokens << tok
+         break if last
+       }
+       tokens
+     end # repair_and_tokenize_author_text
+
+     # Insert underscores to join name particles. i.e.
+     # Jon de Groote ---> Jon de_Groote
+     def join_multi_word_names(author_text)
+       author_text.gsub(/\b((?:van|von|der|den|de|di|le|el))\s/i) {
+         "#{$1}_"
+       }
+     end
+
+     ##
+     # Tries to normalize an individual author name into the form
+     # "First Middle Last", without punctuation.
+     ##
+     def normalize_author_name(auth_toks)
+       return '' if auth_toks.empty?
+       str = auth_toks.join(" ")
+       if str =~ /(.+),\s*(.+)/
+         str = "#{$1} #{$2}"
+       end
+       str.gsub!(/\.\-/, '-')
+       str.gsub!(/[\,\.]/, ' ')
+       str.gsub!(/ +/, ' ')
+       str.strip!
+
+       if (str =~ /^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/)
+         new_toks = str.split(/\s+/)
+         new_order = new_toks[1...new_toks.length]
+         new_order << new_toks[0]
+         str = new_order.join(" ")
+       end
+
+       str.gsub!(/^[^A-Za-z0-9]+/, '')
+       str.gsub!(/[^A-Za-z0-9]+$/, '')
+       return str
+     end
+
+   end
+
+ end
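
As a worked example of normalize_pages above (a sketch; the top-level include is illustrative glue and the input string is hypothetical, but both rewrites follow the regexes shown in the diff):

    include Excite::Postprocessor

    hsh = { 'pages' => '23.2(2010):110-115' }
    normalize_pages(hsh)
    # the first case extracts volume/number/year from the "vol.issue(year):pp" form,
    # the second rewrites the page range into start--end form:
    # hsh == { 'pages' => '110--115', 'volume' => '23', 'number' => '2', 'year' => '2010' }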
data/lib/excite/preprocessor.rb
@@ -0,0 +1,107 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   module Preprocessor
+
+     MARKER_TYPES = {
+       :SQUARE => '\\[.+?\\]',
+       :PAREN => '\\(.+?\\)',
+       :NAKEDNUM => '\\d+',
+       :NAKEDNUMDOT => '\\d+\\.',
+     }
+
+     CLEANUP_RULES_FILE = "#{File.dirname(__FILE__)}/../../config/citation_cleanup_rules.yml"
+
+     def cleanup_rules
+       return @rules if @rules
+
+       raw = YAML.load_file CLEANUP_RULES_FILE
+       @rules = raw['order'].map do |rule_name|
+         re = Regexp.new(raw['rules'][rule_name]['regex'], raw['rules'][rule_name]['ignore_case'])
+         repl = raw['rules'][rule_name]['replacement_str'] || ''
+         { re: re, repl: repl }
+       end
+     end
+
+     ##
+     # Removes lines that appear to be junk from the citation text,
+     # and applies cleanup regexes from the configuration file.
+     ##
+     def normalize_cite_text(cite_text)
+       cite_text.split(/\n/).reject do |line|
+         line.blank? || line =~ /^[\s\d]*$/
+       end.map do |line|
+         normalize_citation(line)
+       end.join("\n")
+     end
+
+     def normalize_citation(cite)
+       cite = cite.dup
+
+       cleanup_rules.each do |rule|
+         cite.gsub!(rule[:re], rule[:repl])
+       end
+
+       cite
+     end
+
+     ##
+     # Controls the process by which citations are segmented,
+     # based on the result of trying to guess the type of
+     # citation marker used in the reference section. Returns
+     # a reference to a list of citation objects.
+     ##
+     def segment_citations(cite_text)
+       marker_type = guess_marker_type(cite_text)
+       if marker_type == 'UNKNOWN' # no marker detected: fall back to unmarked segmentation
+         citations = split_unmarked_citations(cite_text)
+       else
+         citations = split_citations_by_marker(cite_text, marker_type)
+       end
+       return citations
+     end
+
+     ##
+     # Segments citations that have explicit markers in the
+     # reference section. Whenever a new line starts with an
+     # expression that matches what we'd expect of a marker,
+     # a new citation is started. Returns a reference to a
+     # list of citation objects.
+     ##
+     def split_citations_by_marker(cite_text, marker_type=nil)
+       citations = []
+       current_citation = Citation.new
+       current_citation_string = nil
+
+       cite_text.split(/\n/).each {|line|
+         if line =~ /^\s*(#{MARKER_TYPES[marker_type]})\s*(.*)$/
+           marker, cite_string = $1, $2
+           if current_citation_string
+             current_citation.citation_string = current_citation_string
+             citations << current_citation
+             current_citation_string = nil
+           end
+           current_citation = Citation.new
+           current_citation.marker_type = marker_type
+           current_citation.marker = marker
+           current_citation_string = cite_string
+         else
+           if current_citation_string && current_citation_string =~ /\s\-$/
+             current_citation_string.sub!(/\-$/, '') # drop the hyphen so the continuation joins directly
+             current_citation_string << line
+           else
+             current_citation_string = current_citation_string ? "#{current_citation_string} #{line}" : line
+           end
+         end
+       }
+
+       if current_citation && current_citation_string
+         current_citation.citation_string = current_citation_string
+         citations << current_citation
+       end
+       citations
+     end
+
+   end
+ end
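
The cleanup_rules loader above implies a config shape like the following sketch of config/citation_cleanup_rules.yml (the rule name, regex, and replacement are invented for illustration; only the 'order'/'rules' layout and the per-rule keys come from the loader code):

    order:
      - strip_leading_marker
    rules:
      strip_leading_marker:
        regex: '^\s*\[\d+\]\s*'
        ignore_case: false
        replacement_str: ''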