excite 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
data/lib/excite/crfparser.rb
@@ -0,0 +1,322 @@
# encoding: UTF-8

require 'tempfile'
require 'nokogiri'
require 'cgi'
require 'engtagger'

module Excite

  class CRFParser

    attr_reader :feature_order
    attr_reader :token_features

    include TokenFeatures
    include Preprocessor
    include Postprocessor

    DIR = File.dirname(__FILE__)
    TAGGED_REFERENCES = "#{DIR}/resources/trainingdata/tagged_references.txt"
    TAGGED_HTML_REFERENCES = "#{DIR}/resources/trainingdata/tagged_html_references.txt"
    TRAINING_DATA = "#{DIR}/resources/trainingdata/training_data.txt"
    MODEL_FILE = "#{DIR}/resources/model"
    HTML_MODEL_FILE = "#{DIR}/resources/html_model"
    TEMPLATE_FILE = "#{DIR}/resources/parsCit.template"
    HTML_TEMPLATE_FILE = "#{DIR}/resources/html.template"
    CONFIG_FILE = "#{DIR}/../../config/parscit_features.yml"

    # Feature functions must be performed in alphabetical order, since
    # later functions may depend on earlier ones.
    # TODO This seems pretty confusing and dependent on the current features.
    def initialize(mode=:string)
      @mode = mode

      f = File.open(CONFIG_FILE, 'r')
      hsh = YAML::load(f)[mode.to_s]
      @feature_order = hsh["feature_order"].map(&:to_sym)
      @token_features = hsh["feature_order"].sort.map(&:to_sym)
    end
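For reference, the loader above implies a shape for config/parscit_features.yml: one top-level key per mode, each holding a feature_order list. The sketch below is a guess at that minimal shape (the feature names are invented, not copied from the bundled file):

    # string:
    #   feature_order:
    #     - token
    #     - is_cap
    # html:
    #   feature_order:
    #     - token
    #     - in_link

Since @token_features is the alphabetically sorted copy of the same list, the feature methods run in sorted order (so later ones may depend on earlier ones, per the comment above) while the output columns keep feature_order order.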
    def model
      @model ||= CRFPP::Tagger.new("-m #{default_model_file} -v 1")
    end

    def parse(str, presumed_author=nil)
      raw_string = str.dup

      toks, features = str_2_features(str, false, presumed_author)
      tags, overall_prob, tag_probs = eval_crfpp(features, model)

      ret = {}
      tags.each_with_index { |t, i| (ret[t] ||= []) << toks[i].for_join(toks[i-1]) }
      ret.each { |k, v| ret[k] = v.join('').strip }

      normalize_fields(ret)
      ret['raw_string'] = raw_string
      [ret, overall_prob, tag_probs]
    end
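A minimal usage sketch of this entry point, assuming the crfpp bindings and the bundled model load correctly (the citation string and field values below are illustrative, not captured output):

    parser = Excite::CRFParser.new(:string)
    fields, overall_prob, tag_probs = parser.parse('Smith, J. A. A Study of Things. Journal of Examples, 12(3), 2004.')
    fields['title']      # e.g. "A Study of Things"
    fields['raw_string'] # always echoes the untouched input
    overall_prob         # sequence-level confidence from CRF++

Note that parse works on a dup of the input and returns the normalized fields alongside the overall and per-tag probabilities.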
    def eval_crfpp(feat_seq, model)
      model.clear
      feat_seq.each {|vec|
        line = vec.join(" ").strip
        raise unless model.add(line)
      }
      raise unless model.parse
      tags = []
      probs = {}
      feat_seq.length.times {|i|
        tags << model.y2(i)
        probs[model.y2(i)] ||= 1
        probs[model.y2(i)] *= model.prob(i)
      }
      [tags, model.prob, probs]
    end

    def self.strip_punct(str)
      toknp = str.gsub(/[^\w]/, '')
      toknp = "EMPTY" if toknp.blank? # TODO Seems maybe hacky
      toknp
    end

    def normalize_input_author(str)
      return nil if str.blank?
      str.split.map(&:downcase).map{ |t| self.class.strip_punct(t) }.select{ |s| s.length > 2 }
    end

    def prepare_token_data(raw_string, training=false)
      if training
        tags = tagged_string_2_tags(raw_string.strip)

        labels, raw_string, joined_tokens = [], '', ''
        tags.each do |tag|
          raw = CGI.unescapeHTML(tag.inner_html)

          label = tag.name
          raise "Invalid label #{label} for:\n#{raw}" if label.present? && !recognized_labels.include?(label)

          toks = str_2_tokens(raw)

          labels << [label, joined_tokens.length]
          joined_tokens += toks.map(&:raw).join
          raw_string += "\n#{raw}"
        end
      end

      tokens = str_2_tokens(raw_string.strip)

      if training
        joined_tokens = ''
        label, _ = labels.shift
        next_label, end_idx = labels.shift unless labels.empty?

        tokens.each do |tok|
          tok.label = label
          joined_tokens += tok.raw
          if joined_tokens.length == end_idx
            label = next_label
            next_label, end_idx = labels.shift unless labels.empty?
          elsif joined_tokens.length > end_idx && !labels.empty?
            raise "Tokens do not match labels"
          end
        end
        raise "Unused label" unless labels.empty?
      end

      self.clear

      return tokens
    end

    def tagger
      @tagger ||= EngTagger.new
    end

    def tagged_string_2_tags(str)
      str = "<string>#{str}</string>"
      node = Nokogiri::XML.fragment(str).css('string')
      node.children.reject(&:text?)
    end

    def str_2_tokens(str)
      if @mode == :html
        toks = html_str_2_tokens(str)
      elsif @mode == :string
        toks = text_str_2_tokens(str)
      end

      toks.reject { |t| t.empty? }
    end

    def recognized_labels
      if @mode == :string
        ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "tech"]
      elsif @mode == :html
        ["author", "title", "editor", "booktitle", "date", "journal", "volume", "institution", "pages", "location", "publisher", "note", "workid", "link", "bullet"]
      else
        []
      end
    end

    def html_str_2_tokens(str)
      html = Nokogiri::HTML.fragment(str.gsub('>', '> ')) # gsub to ensure strings in separate tags are always separate tokens even if HTML is bad

      tokens = []
      html.traverse do |node|
        tokens += html_text_node_2_tokens(node) if node.text?
      end
      tokens
    end

    def html_text_node_2_tokens(node)
      text = CGI.unescapeHTML(node.text)
      return [] if text.blank?

      tokens = text_str_2_tokens(text)
      tokens.each_with_index { |tok, i| tok.is_in_node!(node, i, tokens.length) }
      tokens
    end

    def text_str_2_tokens(text)
      tagged = tagger.add_tags(normalize_citation(text))
      tags = tagged_string_2_tags(tagged.gsub('&', '&amp;')) # EngTagger has legitimately added angle brackets which are meaningful in XML, but ampersands predate EngTagger and are semantic
      tags.map { |tag| Token.new(tag.text, tag.name) }
    end
    # calculate features on the full citation string
    def str_2_features(raw_string, training=false, presumed_author=nil)
      features = []
      tokens = prepare_token_data(raw_string, training)

      author_names = normalize_input_author(presumed_author)

      tokens.each_with_index do |tok, toki|
        raise "All tokens must be labeled" if training && tok.label.nil?

        feats = {}

        @token_features.each {|f|
          feats[f] = self.send(f, tokens, toki, author_names)
        }

        features << [tok.raw]
        @feature_order.each {|f| features.last << feats[f]}
        features.last << tok.label if training
      end

      [tokens, features]
    end

    def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
      tagged_refs ||= default_tagged_references

      fin = File.open(tagged_refs, 'r')
      fout = File.open(training_data, 'w')
      while l = fin.gets
        _, data = str_2_features(l.strip, true)
        data.each {|line| fout.write("#{line.join(" ")}\n") }
        fout.write("\n")
      end

      fin.close
      fout.flush
      fout.close
    end

    def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
      tagged_refs ||= default_tagged_references
      model ||= default_model_file
      template ||= default_template_file

      if training_data.nil?
        training_data = TRAINING_DATA
        write_training_file(tagged_refs, training_data)
      end

      `crf_learn #{template} #{training_data} #{model} -f3 1>&2`
    end

    def default_tagged_references
      if @mode == :string
        TAGGED_REFERENCES
      elsif @mode == :html
        TAGGED_HTML_REFERENCES
      else
        raise "Unknown mode: #{@mode}"
      end
    end

    def default_model_file
      if @mode == :string
        MODEL_FILE
      elsif @mode == :html
        HTML_MODEL_FILE
      else
        raise "Unknown mode: #{@mode}"
      end
    end

    def default_template_file
      if @mode == :string
        TEMPLATE_FILE
      elsif @mode == :html
        HTML_TEMPLATE_FILE
      else
        raise "Unknown mode: #{@mode}"
      end
    end

  end

  class TrainingError < Exception; end

  class Token

    attr_reader :node, :idx_in_node, :node_token_count, :part_of_speech
    attr_accessor :label

    def initialize(str, part_of_speech=nil)
      @str = str
      @part_of_speech = part_of_speech
    end

    def is_in_node!(node, idx_in_node, node_token_count)
      @node = node
      @idx_in_node = idx_in_node
      @node_token_count = node_token_count
    end

    def raw
      @str
    end

    def np
      @np ||= CRFParser.strip_punct(@str)
    end

    def lcnp
      @lcnp ||= np == "EMPTY" ? np : np.downcase
    end

    def empty?
      raw.strip.blank?
    end

    def to_s
      "{#{raw}}"
    end

    def for_join(prev)
      if ['pp','ppc','ppr','pps','rrb', 'pos'].include?(part_of_speech)
        raw
      elsif prev && ['ppd','ppl','lrb'].include?(prev.part_of_speech)
        raw
      else
        " "+raw
      end
    end
  end

end
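For orientation, training shells out to the CRF++ command line; a hedged sketch of retraining the bundled HTML model (assumes the crf_learn binary is installed and on the PATH, which the gem does not arrange for you):

    parser = Excite::CRFParser.new(:html)
    parser.train # regenerates training_data.txt from the tagged references, then runs crf_learn

write_training_file emits one whitespace-separated feature row per token with the label as the final column, and a blank line between references, which is the sequence format CRF++ expects.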
data/lib/excite/postprocessor.rb
@@ -0,0 +1,252 @@
# encoding: UTF-8

module Excite

  module Postprocessor

    def normalize_fields(citation_hsh)
      citation_hsh.keys.each {|key| self.send("normalize_#{key}", citation_hsh) }
      citation_hsh
    end

    # Call normalize on any fields that don't have their own normalization
    # method defined
    def method_missing(m, *args, &block)
      if m.to_s =~ /^normalize_(.*)$/
        normalize($1, *args)
      else
        super
      end
    end

    # default normalization function for all fields that do not have their
    # own normalization
    # Strip any leading and/or trailing punctuation and space
    def normalize(key, hsh)
      hsh[key].gsub!(/^[^A-Za-z0-9]+/, '')
      hsh[key].gsub!(/[^A-Za-z0-9]+$/, '')
    end

    # strip leading numerals
    # if the real title is quoted inside this string, try to extract it
    # if the title has at least 2 words before a newline or period or open parens, strip everything after
    # TODO could do better with knowledge of prepositions, names - maybe we just need a second model?
    def normalize_title(hsh)
      str = hsh['title'].strip

      numeral_regexes = [
        /^[0-9]+[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial numbers + punctuation + a space, quote, or capital letter
        /^C{0,3}(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])[.)](\s+|(?=["'”’´‘“`'A-Z]))/i, # initial roman numerals
        /^[A-Z][.)](\s+|(?=["'”’´‘“`'A-Z]))/i # initial single letter
      ]

      numeral_regexes.each do |regex|
        if str.gsub!(regex, '')
          break
        end
      end

      if (m = str.match /^(["'”’´‘“`'])/)
        quote_char = m[1]
        pairable = pairable_quote_chars(quote_char)

        if str.scan(/[#{pairable}]/).length >= 2
          str.gsub!(/^#{quote_char}/, '')
          str.gsub!(/[#{pairable}][^#{pairable}]+$/, '')
        end
      end

      while (m = str.match /\S+\s+\S+.*(\n|\.(?=\s|\()).*/)
        i = str.rindex m[1]
        str = str[0..i-1]
      end

      hsh['title'] = str
      normalize('title', hsh)
    end

    def pairable_quote_chars(quote_char)
      [%{"”“}, %{’'`‘´'}].each do |chars|
        return chars if chars.include? quote_char
      end
    end

    ##
    # Tries to split the author tokens into individual author names
    # and then normalizes these names individually. Returns a
    # list of author names.
    ##
    def normalize_author(hsh)
      str = hsh['author']
      tokens = repair_and_tokenize_author_text(str)
      authors = []
      current_auth = []
      begin_auth = 1
      tokens.each {|tok|
        if tok =~ /^(&|and)$/i
          if !current_auth.empty?
            auth = normalize_author_name(current_auth)
            authors << auth
          end
          current_auth = []
          begin_auth = 1
          next
        end
        if begin_auth > 0
          current_auth << tok
          begin_auth = 0
          next
        end
        if tok =~ /,$/
          current_auth << tok
          if !current_auth.empty?
            auth = normalize_author_name(current_auth)
            authors << auth
            current_auth = []
            begin_auth = 1
          end
        else
          current_auth << tok
        end
      }
      if !current_auth.empty?
        auth = normalize_author_name(current_auth)
        authors << auth.strip unless auth.strip == "-" || auth.strip.blank?
      end
      hsh['authors'] = authors if !authors.empty?
      normalize('author', hsh)
      hsh
    end

    def normalize_date(hsh)
      str = hsh['date']
      if str =~ /(\d{4})/
        year = $1.to_i
        current_year = Time.now.year
        if year <= current_year + 3
          ret = year
          hsh['year'] = ret
        else
          ret = nil
        end
      end
      hsh['date'] = ret
      hsh
    end

    def normalize_volume(hsh)
      # If there are two numbers, they are volume and number,
      # e.g. "23(2)", "Vol. 23, No. 3", etc.
      if hsh['volume'] =~ /\D*(\d+)\D+(\d+)/i
        hsh['volume'] = $1
        hsh['number'] = $2
      # Otherwise, just pull out a number and hope that it's the volume
      elsif hsh['volume'] =~ /(\d+)/
        hsh['volume'] = $1
      end
      hsh
    end

    ##
    # Normalizes page fields into the form "start--end". If the page
    # field does not appear to be in a standard form, does nothing.
    ##
    def normalize_pages(hsh)
      # "vol.issue (year):pp"
      case hsh['pages']
      when /(\d+) (?: \.(\d+))? (?: \( (\d\d\d\d) \))? : (\d.*)/x
        hsh['volume'] = $1
        hsh['number'] = $2 if $2
        hsh['year'] = $3 if $3
        hsh['pages'] = $4
      end

      case hsh['pages']
      when /(\d+)[^\d]+(\d+)/
        hsh['pages'] = "#{$1}--#{$2}"
      when /(\d+)/
        hsh['pages'] = $1
      end
      hsh
    end
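Hand-traced examples of the two passes above (inputs invented, not taken from the gem's specs; assume the calls happen inside a class that includes Excite::Postprocessor):

    h = { 'pages' => '12.3(2004):45-67' }
    normalize_pages(h)
    # h => { 'pages' => '45--67', 'volume' => '12', 'number' => '3', 'year' => '2004' }

    normalize_pages('pages' => 'pp. 110-25')['pages'] # => "110--25"

The first case statement peels volume/issue/year off a "vol.issue (year):pp" prefix; the second rewrites any remaining start/end pair to the canonical double-hyphen form.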
    def repair_and_tokenize_author_text(author_text)
      # Repair obvious parse errors and weird notations.
      author_text.sub!(/et\.? al\.?.*$/, '')
      # FIXME: maybe I'm mis-understanding Perl regular expressions, but
      # this pattern from ParseCit appears to do the Wrong Thing:
      # author_text.sub!(/^.*?[a-zA-Z][a-zA-Z]+\. /, '')
      author_text.gsub!(/\(.*?\)/, '')
      author_text.gsub!(/^.*?\)\.?/, '')
      author_text.gsub!(/\(.*?$/, '')
      author_text.gsub!(/\[.*?\]/, '')
      author_text.gsub!(/^.*?\]\.?/, '')
      author_text.gsub!(/\[.*?$/, '')
      author_text.gsub!(/;/, ',')
      author_text.gsub!(/,/, ', ')
      author_text.gsub!(/\:/, ' ')
      author_text.gsub!(/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]/, '')
      author_text = join_multi_word_names(author_text)

      orig_tokens = author_text.split(/\s+/)
      tokens = []
      last = false
      orig_tokens.each_with_index {|tok, i|
        if tok !~ /[A-Za-z&]/
          if i < orig_tokens.length/2
            tokens = []
            next
          else
            last = true
          end
        end
        if (tok =~ /^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i and
            tokens.last =~ /\,$/) or
            tok =~ /^[IVX][IVX]+\.?\,?$/

          next
        end
        tokens << tok
        break if last
      }
      tokens
    end # repair_and_tokenize_author_text

    # Insert underscores to join name particles, i.e.
    # Jon de Groote ---> Jon de_Groote
    def join_multi_word_names(author_text)
      author_text.gsub(/\b((?:van|von|der|den|de|di|le|el))\s/i) {
        "#{$1}_"
      }
    end

    ##
    # Tries to normalize an individual author name into the form
    # "First Middle Last", without punctuation.
    ##
    def normalize_author_name(auth_toks)
      return '' if auth_toks.empty?
      str = auth_toks.join(" ")
      if str =~ /(.+),\s*(.+)/
        str = "#{$1} #{$2}"
      end
      str.gsub!(/\.\-/, '-')
      str.gsub!(/[\,\.]/, ' ')
      str.gsub!(/ +/, ' ')
      str.strip!

      if (str =~ /^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/)
        new_toks = str.split(/\s+/)
        new_order = new_toks[1...new_toks.length]
        new_order << new_toks[0]
        str = new_order.join(" ")
      end

      str.gsub!(/^[^A-Za-z0-9]+/, '')
      str.gsub!(/[^A-Za-z0-9]+$/, '')
      return str
    end

  end

end
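A hand-traced example of the surname flip (input invented): for tokens from "Smith, J. A.", the comma branch rewrites the string to "Smith J. A.", punctuation collapses to spaces, and because every trailing token is a bare initial, the reorder step moves the surname to the end:

    normalize_author_name(["Smith,", "J.", "A."]) # => "J A Smith"

Multi-token surnames are not mangled here because join_multi_word_names has already fused particles ("de Groote" -> "de_Groote") into single tokens.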
data/lib/excite/preprocessor.rb
@@ -0,0 +1,107 @@
# encoding: UTF-8

module Excite

  module Preprocessor

    MARKER_TYPES = {
      :SQUARE => '\\[.+?\\]',
      :PAREN => '\\(.+?\\)',
      :NAKEDNUM => '\\d+',
      :NAKEDNUMDOT => '\\d+\\.',
    }

    CLEANUP_RULES_FILE = "#{File.dirname(__FILE__)}/../../config/citation_cleanup_rules.yml"

    def cleanup_rules
      return @rules if @rules

      raw = YAML.load_file CLEANUP_RULES_FILE
      @rules = raw['order'].map do |rule_name|
        re = Regexp.new(raw['rules'][rule_name]['regex'], raw['rules'][rule_name]['ignore_case'])
        repl = raw['rules'][rule_name]['replacement_str'] || ''
        { re: re, repl: repl }
      end
    end
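The loader above implies a schema for config/citation_cleanup_rules.yml: a top-level order list naming rules, and a rules map whose entries carry regex, ignore_case, and an optional replacement_str. A hypothetical entry (rule name and pattern invented, not copied from the bundled file):

    # order:
    #   - strip_leading_marker
    # rules:
    #   strip_leading_marker:
    #     regex: '^\[\d+\]\s*'
    #     ignore_case: false
    #     replacement_str: ''

Note that Regexp.new treats any truthy non-integer second argument as case-insensitive, so the ignore_case flag maps straight through.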
    ##
    # Removes lines that appear to be junk from the citation text,
    # and applies cleanup regexes from the configuration file.
    ##
    def normalize_cite_text(cite_text)
      cite_text.split(/\n/).reject do |line|
        line.blank? || line =~ /^[\s\d]*$/
      end.map do |line|
        normalize_citation(line)
      end.join("\n")
    end

    def normalize_citation(cite)
      cite = cite.dup

      cleanup_rules.each do |rule|
        cite.gsub!(rule[:re], rule[:repl])
      end

      cite
    end

    ##
    # Controls the process by which citations are segmented,
    # based on the result of trying to guess the type of
    # citation marker used in the reference section. Returns
    # a reference to a list of citation objects.
    ##
    def segment_citations(cite_text)
      marker_type = guess_marker_type(cite_text)
      unless marker_type == 'UNKNOWN'
        citations = split_citations_by_marker(cite_text, marker_type)
      else
        citations = split_unmarked_citations(cite_text)
      end
      return citations
    end

    ##
    # Segments citations that have explicit markers in the
    # reference section. Whenever a new line starts with an
    # expression that matches what we'd expect of a marker,
    # a new citation is started. Returns a reference to a
    # list of citation objects.
    ##
    def split_citations_by_marker(cite_text, marker_type=nil)
      citations = []
      current_citation = Citation.new
      current_citation_string = nil

      cite_text.split(/\n/).each {|line|
        if line =~ /^\s*(#{MARKER_TYPES[marker_type]})\s*(.*)$/
          marker, cite_string = $1, $2
          if current_citation_string
            current_citation.citation_string = current_citation_string
            citations << current_citation
            current_citation_string = nil
          end
          current_citation = Citation.new
          current_citation.marker_type = marker_type
          current_citation.marker = marker
          current_citation_string = cite_string
        else
          if current_citation_string =~ /\s\-$/
            current_citation_string.sub!(/\-$/, '')
            current_citation_string << line
          else
            current_citation_string << " " << line
          end
        end
      }

      if current_citation && current_citation_string
        current_citation.string = current_citation_string
        citations << current_citation
      end
      citations
    end

  end
end
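To make the marker flow concrete, a hedged sketch of how a square-bracketed reference block would move through the two methods above (Citation is the gem's own class from data/lib/excite/citation.rb, which this diff view does not show; guess_marker_type and split_unmarked_citations are likewise defined elsewhere, and the marker type is assumed to resolve to :SQUARE for this input):

    text = "[1] Smith, J. A Study of Things. 2004.\n[2] Doe, J. Another Study. 2006.\n"
    cites = segment_citations(text)
    cites.length       # => 2
    cites.first.marker # => "[1]"

Lines that do not open with a marker are appended to the current citation; a trailing " -" is treated as a line-break hyphenation and stripped before the continuation is joined.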