excite 2.1.1

Files changed (44)
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0

data/lib/excite/crfparser.rb
@@ -0,0 +1,322 @@
+ # encoding: UTF-8
+
+ require 'tempfile'
+ require 'nokogiri'
+ require 'cgi'
+ require 'engtagger'
+
+ module Excite
+
+   class CRFParser
+
+     attr_reader :feature_order
+     attr_reader :token_features
+
+     include TokenFeatures
+     include Preprocessor
+     include Postprocessor
+
+     DIR = File.dirname(__FILE__)
+     TAGGED_REFERENCES = "#{DIR}/resources/trainingdata/tagged_references.txt"
+     TAGGED_HTML_REFERENCES = "#{DIR}/resources/trainingdata/tagged_html_references.txt"
+     TRAINING_DATA = "#{DIR}/resources/trainingdata/training_data.txt"
+     MODEL_FILE = "#{DIR}/resources/model"
+     HTML_MODEL_FILE = "#{DIR}/resources/html_model"
+     TEMPLATE_FILE = "#{DIR}/resources/parsCit.template"
+     HTML_TEMPLATE_FILE = "#{DIR}/resources/html.template"
+     CONFIG_FILE = "#{DIR}/../../config/parscit_features.yml"
+
+     # Feature functions must be performed in alphabetical order, since
+     # later functions may depend on earlier ones.
+     # TODO This seems pretty confusing and dependent on the current features.
+     def initialize(mode=:string)
+       @mode = mode
+
+       f = File.open(CONFIG_FILE, 'r')
+       hsh = YAML::load(f)[mode.to_s]
+       @feature_order = hsh["feature_order"].map(&:to_sym)
+       @token_features = hsh["feature_order"].sort.map(&:to_sym)
+     end
+
+     def model
+       @model ||= CRFPP::Tagger.new("-m #{default_model_file} -v 1");
+     end
+
+     def parse(str, presumed_author=nil)
+       raw_string = str.dup
+
+       toks, features = str_2_features(str, false, presumed_author)
+       tags, overall_prob, tag_probs = eval_crfpp(features, model)
+
+       ret = {}
+       tags.each_with_index { |t, i| (ret[t] ||= []) << toks[i].for_join(toks[i-1]) }
+       ret.each { |k, v| ret[k] = v.join('').strip }
+
+       normalize_fields(ret)
+       ret['raw_string'] = raw_string
+       [ret, overall_prob, tag_probs]
+     end
+
+     def eval_crfpp(feat_seq, model)
+       model.clear
+       feat_seq.each {|vec|
+         line = vec.join(" ").strip
+         raise unless model.add(line)
+       }
+       raise unless model.parse
+       tags = []
+       probs = {}
+       feat_seq.length.times {|i|
+         tags << model.y2(i)
+         probs[model.y2(i)] ||= 1
+         probs[model.y2(i)] *= model.prob(i)
+       }
+       [tags, model.prob, probs]
+     end
+
+     def self.strip_punct(str)
+       toknp = str.gsub(/[^\w]/, '')
+       toknp = "EMPTY" if toknp.blank? # TODO Seems maybe hacky
+       toknp
+     end
+
+     def normalize_input_author(str)
+       return nil if str.blank?
+       str.split.map(&:downcase).map{ |t| self.class.strip_punct(t) }.select{ |s| s.length > 2 }
+     end
+
+     def prepare_token_data(raw_string, training=false)
+       if training
+         tags = tagged_string_2_tags(raw_string.strip)
+
+         labels, raw_string, joined_tokens = [], '', ''
+         tags.each do |tag|
+           raw = CGI.unescapeHTML(tag.inner_html)
+
+           label = tag.name
+           raise "Invalid label #{label} for:\n#{raw}" if label.present? && !recognized_labels.include?(label)
+
+           toks = str_2_tokens(raw)
+
+           labels << [label, joined_tokens.length]
+           joined_tokens += toks.map(&:raw).join
+           raw_string += "\n#{raw}"
+         end
+       end
+
+       tokens = str_2_tokens(raw_string.strip)
+
+       if training
+         joined_tokens = ''
+         label, _ = labels.shift
+         next_label, end_idx = labels.shift unless labels.empty?
+
+         tokens.each do |tok|
+           tok.label = label
+           joined_tokens += tok.raw
+           if joined_tokens.length == end_idx
+             label = next_label
+             next_label, end_idx = labels.shift unless labels.empty?
+           elsif joined_tokens.length > end_idx && !labels.empty?
+             raise "Tokens do not match labels"
+           end
+         end
+         raise "Unused label" unless labels.empty?
+       end
+
+       self.clear
+
+       return tokens
+     end
+
+     def tagger
+       @tagger ||= EngTagger.new
+     end
+
+     def tagged_string_2_tags(str)
+       str = "<string>#{str}</string>"
+       node = Nokogiri::XML.fragment(str).css('string')
+       node.children.reject(&:text?)
+     end
+
+     def str_2_tokens(str)
+       if @mode == :html
+         toks = html_str_2_tokens(str)
+       elsif @mode == :string
+         toks = text_str_2_tokens(str)
+       end
+
+       toks.reject { |t| t.empty? }
+     end
+
+     def recognized_labels
+       if @mode == :string
+         ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "tech"]
+       elsif @mode == :html
+         ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "workid", "link", "bullet"]
+       else
+         []
+       end
+     end
+
+     def html_str_2_tokens(str)
+       html = Nokogiri::HTML.fragment(str.gsub('>', '> ')) # gsub to ensure strings in separate tags are always separate tokens even if HTML is bad
+
+       tokens = []
+       html.traverse do |node|
+         tokens += html_text_node_2_tokens(node) if node.text?
+       end
+       tokens
+     end
+
+     def html_text_node_2_tokens(node)
+       text = CGI.unescapeHTML(node.text)
+       return [] if text.blank?
+
+       tokens = text_str_2_tokens(text)
+       tokens.each_with_index { |tok, i| tok.is_in_node!(node, i, tokens.length) }
+       tokens
+     end
+
+     def text_str_2_tokens(text)
+       tagged = tagger.add_tags(normalize_citation(text))
+       tags = tagged_string_2_tags(tagged.gsub('&','&amp;')) # EngTagger has legitimately added angle brackets which are meaningful in XML, but angle-brackets predate EngTagger and are semantic
+       tags.map { |tag| Token.new(tag.text, tag.name) }
+     end
+
+     # calculate features on the full citation string
+     def str_2_features(raw_string, training=false, presumed_author=nil)
+       features = []
+       tokens = prepare_token_data(raw_string, training)
+
+       author_names = normalize_input_author(presumed_author)
+
+       tokens.each_with_index do |tok, toki|
+         raise "All tokens must be labeled" if training && tok.label.nil?
+
+         feats = {}
+
+         @token_features.each {|f|
+           feats[f] = self.send(f, tokens, toki, author_names)
+         }
+
+         features << [tok.raw]
+         @feature_order.each {|f| features.last << feats[f]}
+         features.last << tok.label if training
+       end
+
+       [tokens, features]
+     end
+
+     def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
+       tagged_refs ||= default_tagged_references
+
+       fin = File.open(tagged_refs, 'r')
+       fout = File.open(training_data, 'w')
+       x = 0
+       while l = fin.gets
+         _, data = str_2_features(l.strip, true)
+         data.each {|line| fout.write("#{line.join(" ")}\n") }
+         fout.write("\n")
+       end
+
+       fin.close
+       fout.flush
+       fout.close
+     end
+
+     def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
+       tagged_refs ||= default_tagged_references
+       model ||= default_model_file
+       template ||= default_template_file
+
+       if training_data.nil?
+         training_data = TRAINING_DATA
+         write_training_file(tagged_refs, training_data)
+       end
+
+       `crf_learn #{template} #{training_data} #{model} -f3 1>&2`
+     end
+
+     def default_tagged_references
+       if @mode == :string
+         TAGGED_REFERENCES
+       elsif @mode == :html
+         TAGGED_HTML_REFERENCES
+       else
+         raise "Unknown mode: #{@mode}"
+       end
+     end
+
+     def default_model_file
+       if @mode == :string
+         MODEL_FILE
+       elsif @mode == :html
+         HTML_MODEL_FILE
+       else
+         raise "Unknown mode: #{@mode}"
+       end
+     end
+
+     def default_template_file
+       if @mode == :string
+         TEMPLATE_FILE
+       elsif @mode == :html
+         HTML_TEMPLATE_FILE
+       else
+         raise "Unknown mode: #{@mode}"
+       end
+     end
+
+   end
+
+   class TrainingError < Exception; end
+
+   class Token
+
+     attr_reader :node, :idx_in_node, :node_token_count, :part_of_speech
+     attr_accessor :label
+
+     def initialize(str, part_of_speech=nil)
+       @str = str
+       @part_of_speech = part_of_speech
+     end
+
+     def is_in_node!(node, idx_in_node, node_token_count)
+       @node = node
+       @idx_in_node = idx_in_node
+       @node_token_count = node_token_count
+     end
+
+     def raw
+       @str
+     end
+
+     def np
+       @np ||= CRFParser.strip_punct(@str)
+     end
+
+     def lcnp
+       @lcnp ||= np == "EMPTY" ? np : np.downcase
+     end
+
+     def empty?
+       raw.strip.blank?
+     end
+
+     def to_s
+       "{#{raw}}"
+     end
+
+     def for_join(prev)
+       if ['pp','ppc','ppr','pps','rrb', 'pos'].include?(part_of_speech)
+         raw
+       elsif prev && ['ppd','ppl','lrb'].include?(prev.part_of_speech)
+         raw
+       else
+         " "+raw
+       end
+     end
+   end
+
+ end
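
For orientation, here is a minimal usage sketch of the parser defined above. It is hypothetical (not part of the diff): it assumes the gem is loaded with require 'excite' and that the CRF++ Ruby bindings (CRFPP::Tagger) used by the model method are installed; in practice the gem's Citation class in data/lib/excite/citation.rb presumably wraps this call.

# Hypothetical usage sketch, not part of the gem's source.
require 'excite'

parser = Excite::CRFParser.new(:string)
fields, overall_prob, tag_probs = parser.parse(
  "Kernighan, B. and Ritchie, D. The C Programming Language. Prentice Hall, 1988."
)

fields['raw_string']  # the original input string, kept verbatim
fields['title']       # whatever span the CRF labeled as the title, after normalization
fields['authors']     # individual author names, if normalize_author could split them
overall_prob          # probability CRF++ assigns to the whole tag sequence
tag_probs             # per-label confidence values built up in eval_crfpp above
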
data/lib/excite/postprocessor.rb
@@ -0,0 +1,252 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   module Postprocessor
+
+     def normalize_fields(citation_hsh)
+       citation_hsh.keys.each {|key| self.send("normalize_#{key}", citation_hsh) }
+       citation_hsh
+     end
+
+     def method_missing(m, *args, &block)
+       # Call normalize on any fields that don't have their own normalization
+       # method defined
+       if m.to_s =~ /^normalize/
+         m.to_s =~ /normalize_(.*)$/
+         normalize($1, *args)
+       else super
+       end
+     end
+
+     # default normalization function for all fields that do not have their
+     # own normalization
+     # Strip any leading and/or trailing punctuation and space
+     def normalize(key, hsh)
+       hsh[key].gsub!(/^[^A-Za-z0-9]+/, '')
+       hsh[key].gsub!(/[^A-Za-z0-9]+$/, '')
+     end
+
+     # strip leading numerals
+     # if the real title is quoted inside this string, try to extract it
+     # if the title has at least 2 words before a newline or period or open parens, strip everything after
+     # TODO could do better with knowledge of prepositions, names - maybe we just need a second model?
+     def normalize_title(hsh)
+       str = hsh['title'].strip
+
+       numeral_regexes = [
+         /^[0-9]+[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial numbers + punctuation + space or a quote or a capital letter
+         /^C{0,3}(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial roman numerals
+         /^[A-Z][.)](\s+|(?=["'”’´‘“`'A-Z]))/i # initial single letter
+       ]
+
+       numeral_regexes.each do |regex|
+         if str.gsub!(regex, '')
+           break
+         end
+       end
+
+       if (m = str.match /^(["'”’´‘“`'])/)
+         quote_char = m[1]
+         pairable = pairable_quote_chars(quote_char)
+
+         if str.scan(/[#{pairable}]/).length >= 2
+           str.gsub!(/^#{quote_char}/, '')
+           str.gsub!(/[#{pairable}][^#{pairable}]+$/, '')
+         end
+       end
+
+       while (m = str.match /\S+\s+\S+.*(\n|\.(?=\s|\()).*/)
+         i = str.rindex m[1]
+         str = str[0..i-1]
+       end
+
+       hsh['title'] = str
+       normalize('title',hsh)
+     end
+
+     def pairable_quote_chars(quote_char)
+       [%{"”“}, %{’'`‘´'}].each do |chars|
+         return chars if chars.include? quote_char
+       end
+     end
+
+     ##
+     # Tries to split the author tokens into individual author names
+     # and then normalizes these names individually. Returns a
+     # list of author names.
+     ##
+     def normalize_author(hsh)
+       str = hsh['author']
+       tokens = repair_and_tokenize_author_text(str)
+       authors = []
+       current_auth = []
+       begin_auth = 1
+       tokens.each {|tok|
+         if tok =~ /^(&|and)$/i
+           if !current_auth.empty?
+             auth = normalize_author_name(current_auth)
+             authors << auth
+           end
+           current_auth = []
+           begin_auth = 1
+           next
+         end
+         if begin_auth > 0
+           current_auth << tok
+           begin_auth = 0
+           next
+         end
+         if tok =~ /,$/
+           current_auth << tok
+           if !current_auth.empty?
+             auth = normalize_author_name(current_auth)
+             authors << auth
+             current_auth = []
+             begin_auth = 1
+           end
+         else
+           current_auth << tok
+         end
+       }
+       if !current_auth.empty?
+         auth = normalize_author_name(current_auth)
+         authors << auth.strip unless auth.strip == "-" || auth.strip.blank?
+       end
+       hsh['authors'] = authors if !authors.empty?
+       normalize('author',hsh)
+       hsh
+     end
+
+     def normalize_date(hsh)
+       str = hsh['date']
+       if str =~ /(\d{4})/
+         year = $1.to_i
+         current_year = Time.now.year
+         if year <= current_year+3
+           ret = year
+           hsh['year'] = ret
+         else
+           ret = nil
+         end
+       end
+       hsh['date'] = ret
+       hsh
+     end
+
+     def normalize_volume(hsh)
+       # If there are two numbers, they are volume and number.
+       # e.g. "23(2)", "Vol. 23, No. 3" etc...
+       if hsh['volume'] =~ /\D*(\d+)\D+(\d+)/i
+         hsh['volume'] = $1
+         hsh['number'] = $2
+       # Otherwise, just pull out a number and hope that it's the volume
+       elsif hsh['volume'] =~ /(\d+)/
+         hsh['volume'] = $1
+       end
+       hsh
+     end
+
+     ##
+     # Normalizes page fields into the form "start--end". If the page
+     # field does not appear to be in a standard form, does nothing.
+     ##
+     def normalize_pages(hsh)
+       # "vol.issue (year):pp"
+       case hsh['pages']
+       when /(\d+) (?: \.(\d+))? (?: \( (\d\d\d\d) \))? : (\d.*)/x
+         hsh['volume'] = $1
+         hsh['number'] = $2 if $2
+         hsh['year'] = $3 if $3
+         hsh['pages'] = $4
+       end
+
+       case hsh['pages']
+       when /(\d+)[^\d]+(\d+)/
+         hsh['pages'] = "#{$1}--#{$2}"
+       when /(\d+)/
+         hsh['pages'] = $1
+       end
+       hsh
+     end
+
+     def repair_and_tokenize_author_text(author_text)
+       # Repair obvious parse errors and weird notations.
+       author_text.sub!(/et\.? al\.?.*$/, '')
+       # FIXME: maybe I'm mis-understanding Perl regular expressions, but
+       # this pattern from ParseCit appears to do the Wrong Thing:
+       # author_text.sub!(/^.*?[a-zA-Z][a-zA-Z]+\. /, '')
+       author_text.gsub!(/\(.*?\)/, '')
+       author_text.gsub!(/^.*?\)\.?/, '')
+       author_text.gsub!(/\(.*?$/, '')
+       author_text.gsub!(/\[.*?\]/, '')
+       author_text.gsub!(/^.*?\]\.?/, '')
+       author_text.gsub!(/\[.*?$/, '')
+       author_text.gsub!(/;/, ',')
+       author_text.gsub!(/,/, ', ')
+       author_text.gsub!(/\:/, ' ')
+       author_text.gsub!(/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]/, '')
+       author_text = join_multi_word_names(author_text)
+
+       orig_tokens = author_text.split(/\s+/)
+       tokens = []
+       last = false
+       orig_tokens.each_with_index {|tok, i|
+         if tok !~ /[A-Za-z&]/
+           if i < orig_tokens.length/2
+             tokens = []
+             next
+           else
+             last = true
+           end
+         end
+         if (tok =~ /^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i and
+             tokens.last =~ /\,$/) or
+             tok =~ /^[IVX][IVX]+\.?\,?$/
+
+           next
+         end
+         tokens << tok
+         break if last
+       }
+       tokens
+     end # repair_and_tokenize_author_text
+
+     # Insert underscores to join name particles. i.e.
+     # Jon de Groote ---> Jon de_Groote
+     def join_multi_word_names(author_text)
+       author_text.gsub(/\b((?:van|von|der|den|de|di|le|el))\s/i) {
+         "#{$1}_"
+       }
+     end
+
+     ##
+     # Tries to normalize an individual author name into the form
+     # "First Middle Last", without punctuation.
+     ##
+     def normalize_author_name(auth_toks)
+       return '' if auth_toks.empty?
+       str = auth_toks.join(" ")
+       if str =~ /(.+),\s*(.+)/
+         str = "#{$1} #{$2}"
+       end
+       str.gsub!(/\.\-/, '-')
+       str.gsub!(/[\,\.]/, ' ')
+       str.gsub!(/ +/, ' ')
+       str.strip!
+
+       if (str =~ /^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/)
+         new_toks = str.split(/\s+/)
+         new_order = new_toks[1...new_toks.length];
+         new_order << new_toks[0]
+         str = new_order.join(" ")
+       end
+
+       str.gsub!(/^[^A-Za-z0-9]+/, '')
+       str.gsub!(/[^A-Za-z0-9]+$/, '')
+       return str
+     end
+
+   end
+
+ end
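
As a rough illustration of the normalization pass above, here is a hypothetical snippet (not part of the diff) that mixes Postprocessor into a throwaway class, the same way CRFParser does, and runs normalize_fields on a hand-built hash. It assumes require 'excite' has loaded the gem's own dependencies (String#blank? comes from ActiveSupport).

# Hypothetical illustration, not part of the gem's source.
require 'excite'

class NormalizerDemo
  include Excite::Postprocessor
end

hsh = {
  'author' => 'Kernighan, B. and Ritchie, D.',
  'date'   => 'June 1988',
  'pages'  => '12-34'
}
NormalizerDemo.new.normalize_fields(hsh)

# Per the rules above, this should yield roughly:
#   hsh['authors'] => ["B Kernighan", "D Ritchie"]
#   hsh['year']    => 1988
#   hsh['pages']   => "12--34"
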
data/lib/excite/preprocessor.rb
@@ -0,0 +1,107 @@
+ # encoding: UTF-8
+
+ module Excite
+
+   module Preprocessor
+
+     MARKER_TYPES = {
+       :SQUARE => '\\[.+?\\]',
+       :PAREN => '\\(.+?\\)',
+       :NAKEDNUM => '\\d+',
+       :NAKEDNUMDOT => '\\d+\\.',
+     }
+
+     CLEANUP_RULES_FILE = "#{File.dirname(__FILE__)}/../../config/citation_cleanup_rules.yml"
+
+     def cleanup_rules
+       return @rules if @rules
+
+       raw = YAML.load_file CLEANUP_RULES_FILE
+       @rules = raw['order'].map do |rule_name|
+         re = Regexp.new(raw['rules'][rule_name]['regex'], raw['rules'][rule_name]['ignore_case'])
+         repl = raw['rules'][rule_name]['replacement_str'] || ''
+         { re: re, repl: repl }
+       end
+     end
+
+     ##
+     # Removes lines that appear to be junk from the citation text,
+     # and applies cleanup regexes from the configuration file.
+     ##
+     def normalize_cite_text(cite_text)
+       cite_text.split(/\n/).reject do |line|
+         line.blank? || line =~ /^[\s\d]*$/
+       end.map do |line|
+         normalize_citation(line)
+       end.join("\n")
+     end
+
+     def normalize_citation(cite)
+       cite = cite.dup
+
+       cleanup_rules.each do |rule|
+         cite.gsub!(rule[:re], rule[:repl])
+       end
+
+       cite
+     end
+
+     ##
+     # Controls the process by which citations are segmented,
+     # based on the result of trying to guess the type of
+     # citation marker used in the reference section. Returns
+     # a reference to a list of citation objects.
+     ##
+     def segment_citations(cite_text)
+       marker_type = guess_marker_type(cite_text)
+       if marker_type == 'UNKNOWN'
+         citations = split_unmarked_citations(cite_text)
+       else
+         citations = split_citations_by_marker(cite_text, marker_type)
+       end
+       return citations
+     end
+
+     ##
+     # Segments citations that have explicit markers in the
+     # reference section. Whenever a new line starts with an
+     # expression that matches what we'd expect of a marker,
+     # a new citation is started. Returns a reference to a
+     # list of citation objects.
+     ##
+     def split_citations_by_marker(cite_text, marker_type=nil)
+       citations = []
+       current_citation = Citation.new
+       current_citation_string = nil
+
+       cite_text.split(/\n/).each {|line|
+         if line =~ /^\s*(#{MARKER_TYPES[marker_type]})\s*(.*)$/
+           marker, cite_string = $1, $2
+           if current_citation_string
+             current_citation.citation_string = current_citation_string
+             citations << current_citation
+             current_citation_string = nil
+           end
+           current_citation = Citation.new
+           current_citation.marker_type = marker_type
+           current_citation.marker = marker
+           current_citation_string = cite_string
+         else
+           if current_citation_string =~ /\s\-$/
+             current_citation_string.sub!(/\-$/, '')
+             current_citation_string << line
+           else
+             current_citation_string << " " << line
+           end
+         end
+       }
+
+       if current_citation && current_citation_string
+         current_citation.citation_string = current_citation_string
+         citations << current_citation
+       end
+       citations
+     end
+
+   end
+ end
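
Finally, a small hypothetical sketch (not part of the diff) of the preprocessing entry points, assuming the packaged config/citation_cleanup_rules.yml from the file list above is available once require 'excite' has loaded the gem.

# Hypothetical sketch, not part of the gem's source.
require 'excite'

class CleanupDemo
  include Excite::Preprocessor
end

text = "1) Smith, J. (2001). A paper about parsing.\n12\n\n2) Doe, A. (2002). Another paper."
puts CleanupDemo.new.normalize_cite_text(text)
# Blank and digit-only lines (e.g. the stray "12") are dropped, and each surviving
# line is run through the regex rules loaded from citation_cleanup_rules.yml.
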