anystyle-parser 0.0.1

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
# -*- encoding: utf-8 -*-

module Anystyle
  module Parser

    class Normalizer

      include Singleton

      MONTH = Hash.new do |h,k|
        case k
        when /jan/i
          h[k] = 1
        when /feb/i
          h[k] = 2
        when /mar/i
          h[k] = 3
        when /apr/i
          h[k] = 4
        when /ma[yi]/i
          h[k] = 5
        when /jun/i
          h[k] = 6
        when /jul/i
          h[k] = 7
        when /aug/i
          h[k] = 8
        when /sep/i
          h[k] = 9
        when /o[ck]t/i
          h[k] = 10
        when /nov/i
          h[k] = 11
        when /dec/i
          h[k] = 12
        else
          h[k] = nil
        end
      end
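
      # For example, MONTH['Sept.'] returns (and memoizes) 9, while an
      # unrecognized key is cached as nil.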

      def method_missing(name, *arguments, &block)
        case name.to_s
        when /^normalize_(.+)$/
          normalize($1.to_sym, *arguments, &block)
        else
          super
        end
      end
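
      # Thanks to the dispatch above, a call such as normalize_publisher(hash)
      # falls back to the generic #normalize for any label that has no
      # dedicated handler below.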

      # Default normalizer. Strips punctuation.
      def normalize(key, hash)
        token, *dangling = hash[key]
        unmatched(key, hash, dangling) unless dangling.empty?

        token.gsub!(/^\W+|\W+$/, '')
        hash[key] = token
        hash
      rescue => e
        warn e.message
        hash
      end

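      # Normalizes the :author value. If the token is actually an editor
      # signature (e.g. "J. Doe (ed.)"), the value is moved to :editor;
      # otherwise "et al." markers are recorded and the names normalized.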
      def normalize_author(hash)
        authors, *dangling = hash[:author]
        unmatched(:author, hash, dangling) unless dangling.empty?

        if authors =~ /\W*[Ee]d(s|itors)?\W*$/ && !hash.has_key?(:editor)
          hash[:editor] = hash.delete(:author)
          normalize_editor(hash)
        else
          hash['more-authors'] = true if !!authors.sub!(/\bet\.?\s*al.*$/i, '')
          authors.gsub!(/^\W+|\W+$/, '')
          hash[:author] = normalize_names(authors)
        end

        hash
      rescue => e
        warn e.message
        hash
      end

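      # Normalizes the :editor value: extracts an edition number if present,
      # strips "ed./eds./edited by" markers, and, when the token turns out to
      # be a translator credit, copies the names to :translator as well.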
      def normalize_editor(hash)
        editors, edition = hash[:editor]

        unless edition.nil?
          if edition =~ /(\d+)/
            hash[:edition] = $1.to_i
          end
        end

        hash['more-editors'] = true if !!editors.sub!(/\bet\.?\s*al.*$/i, '')

        editors.gsub!(/^\W+|\W+$/, '')
        editors.gsub!(/^in\s+/i, '')
        editors.gsub!(/\W*[Ee]d(s|itors|ited)?\W*?/, '')
        editors.gsub!(/\bby\b/i, '')

        is_trans = !!editors.gsub!(/\W*trans(lated)?\W*/i, '')

        hash[:editor] = normalize_names(editors)
        hash[:translator] = hash[:editor] if is_trans

        hash
      rescue => e
        warn e.message
        hash
      end

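      # Normalizes the :translator value by stripping "trans./translated by"
      # markers before normalizing the names.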
      def normalize_translator(hash)
        translators = hash[:translator]

        translators.gsub!(/^\W+|\W+$/, '')
        translators.gsub!(/\W*trans(lated)?\W*/i, '')
        translators.gsub!(/\bby\b/i, '')

        hash[:translator] = normalize_names(translators)
        hash
      rescue => e
        warn e.message
        hash
      end

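      # Tokenizes a string of multiple names, adds missing periods after
      # initials, and joins the names with ' and ', e.g.
      # "Poe, E A, and J Verne" #=> "Poe, E. A. and J. Verne"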
      def normalize_names(names)
        names = tokenize_names(names).map do |name|
          name.strip!
          name.gsub!(/\b([[:upper:]])(\W|$)/) { [$1, $2 == ?. ? nil : ?., $2].compact.join }
          name
        end
        names.join(' and ')
      rescue => e
        warn e.message
        names
      end

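      # Splits a name list on 'and'/'&' and on commas that separate names
      # (as opposed to the comma in "Last, First"); suffixes such as Jr.
      # or PhD are kept with the preceding name.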
      def tokenize_names(names)
        s, n, ns, cc = StringScanner.new(names), '', [], 0
        until s.eos?
          case
          when s.scan(/,?\s*and\b|&/)
            ns << n
            n, cc = '', 0
          when s.scan(/\s+/)
            n << ' '
          when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\.?/i)
            n << s.matched
          when s.scan(/,/)
            if cc > 0 || n =~ /\w\w+\s+\w\w+/
              ns << n
              n, cc = '', 0
            else
              n << s.matched
              cc += 1
            end
          when s.scan(/\w+/), s.scan(/./)
            n << s.matched
          end
        end
        ns << n
      end

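      # Normalizes the :title value: a second captured token is promoted to
      # :container, edition statements are extracted, and trailing
      # punctuation and surrounding quotation marks are removed.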
      def normalize_title(hash)
        title, container = hash[:title]

        unless container.nil?
          hash[:container] = container
          normalize(:container, hash)
        end

        extract_edition(title, hash)

        title.gsub!(/[\.,:;\s]+$/, '')
        title.gsub!(/^["'”’´‘“`]|["'”’´‘“`]$/, '')

        hash[:title] = title

        hash
      rescue => e
        warn e.message
        hash
      end

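      # Moves edition statements such as "2nd ed.", "revised", or "reprint"
      # from the token into hash[:edition].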
      def extract_edition(token, hash)
        edition = [hash[:edition]].flatten.compact

        if token.gsub!(/\W*(\d+)(?:st|nd|rd|th)?\s*ed(?:ition|\.)?\W*/i, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Ee]xpanded)\W*$/, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Ii]llustrated)\W*$/, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Rr]evised)\W*$/, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Rr]eprint)\W*$/, '')
          edition << $1
        end

        hash[:edition] = edition.join(', ') unless edition.empty?
      end

      def normalize_booktitle(hash)
        booktitle, *dangling = hash[:booktitle]
        unmatched(:booktitle, hash, dangling) unless dangling.empty?

        booktitle.gsub!(/^in\s*/i, '')

        extract_edition(booktitle, hash)

        booktitle.gsub!(/[\.,:;\s]+$/, '')
        hash[:booktitle] = booktitle

        hash
      rescue => e
        warn e.message
        hash
      end

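      # Normalizes the :container value; "Dissertation Abstracts" containers
      # additionally set :category and mark the reference as a :phdthesis.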
      def normalize_container(hash)
        container, *dangling = hash[:container]
        unmatched(:container, hash, dangling) unless dangling.empty?

        case container
        when /dissertation abstracts/i
          container.gsub!(/\s*section \w: ([\w\s]+).*$/i, '')
          hash[:category] = $1 unless $1.nil?
          hash[:type] = :phdthesis
        end

        hash[:container] = container
        hash
      rescue => e
        warn e.message
        hash
      end

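      # Normalizes the :date value: a recognizable month name sets :month,
      # and a four-digit year replaces :date with :year.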
      def normalize_date(hash)
        date, *dangling = hash[:date]
        unmatched(:date, hash, dangling) unless dangling.empty?

        unless (month = MONTH[date]).nil?
          hash[:month] = month
        end

        if date =~ /(\d{4})/
          hash[:year] = $1.to_i
          hash.delete(:date)
        end

        hash
      rescue => e
        warn e.message
        hash
      end

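      # Splits combined volume/issue tokens into :volume and :number,
      # e.g. "vol. 3, no. 7" yields :volume => 3 and :number => 7.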
      def normalize_volume(hash)
        volume, *dangling = hash[:volume]
        unmatched(:volume, hash, dangling) unless dangling.empty?

        case volume
        when /\D*(\d+)\D+(\d+[\s&-]+\d+)/
          hash[:volume], hash[:number] = $1.to_i, $2
        when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
          hash[:volume] = $1.to_i unless $1.nil?
          hash[:number] = $2
        when /(\d+)?\D+no\.\s*(\d+)/
          hash[:volume] = $1.to_i unless $1.nil?
          hash[:number] = $2.to_i
        when /\D*(\d+)\D+(\d+)/
          hash[:volume], hash[:number] = $1.to_i, $2.to_i
        when /(\d+)/
          hash[:volume] = $1.to_i
        end

        hash
      rescue => e
        warn e.message
        hash
      end

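      # Normalizes the :pages value: "volume.issue(year):pages" citations are
      # split into their parts and page ranges use BibTeX's double dash,
      # e.g. "72.2(1994):12-25" yields :volume, :number, :year and "12--25".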
      def normalize_pages(hash)
        pages, *dangling = hash[:pages]
        unmatched(:pages, hash, dangling) unless dangling.empty?

        # "volume.issue(year):pp"
        case pages
        when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
          hash[:volume] = $1.to_i
          hash[:number] = $2.to_i unless $2.nil?
          hash[:year] = $3.to_i unless $3.nil?
          hash[:pages] = $4
        end

        case hash[:pages]
        when /(\d+)\D+(\d+)/
          hash[:pages] = [$1, $2].join('--')
        when /(\d+)/
          hash[:pages] = $1
        end

        hash
      rescue => e
        warn e.message
        hash
      end

      private

      def unmatched(label, hash, tokens)
        hash["unmatched-#{label}"] = tokens.join(' ')
      end

    end

  end
end
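
The normalizer is a label-keyed singleton. A minimal sketch of driving it directly (assuming the gem and its dependencies are loaded and that `require 'anystyle/parser'` is the entry point):

    require 'anystyle/parser'

    hash = { :author => 'Poe, E A, and J Verne', :date => 'Sept. 2011' }
    normalizer = Anystyle::Parser::Normalizer.instance
    normalizer.normalize_author(hash) # hash[:author] => "Poe, E. A. and J. Verne"
    normalizer.normalize_date(hash)   # hash[:month] => 9, hash[:year] => 2011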
@@ -0,0 +1,240 @@
module Anystyle
  module Parser

    class Parser

      @models = Hash.new { |h,k| k }.merge(
        :anystyle => File.expand_path('../support/anystyle.mod', __FILE__),
        :cora => File.expand_path('../support/cora.mod', __FILE__)
      )
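
      # Unknown model names fall through the Hash default above and are
      # used verbatim, so arbitrary model paths are supported.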

      @formats = [:bibtex, :hash, :citeproc].freeze

      @defaults = {
        :model => :anystyle,
        :pattern => File.expand_path('../support/anystyle.pat', __FILE__),
        :separator => /\s+/,
        :tagged_separator => /\s+|(<\/?[^>]+>)/,
        :strip => /\W/,
        :format => :hash
      }.freeze

      @features = Feature.instances
      @feature = Hash.new { |h,k| h[k.to_sym] = features.detect { |f| f.name == k.to_sym } }

      class << self

        attr_reader :defaults, :features, :feature, :models, :formats

        def load(path)
          p = new
          p.model = Wapiti.load(path)
          p
        end

        # Returns a default parser instance
        def instance
          @instance ||= new
        end

      end

      attr_reader :options

      attr_accessor :model, :normalizer

      def initialize(options = {})
        @options = Parser.defaults.merge(options)
        @model = Wapiti.load(Parser.models[@options[:model]])
        @normalizer = Normalizer.instance
      end

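      # Parses the passed-in references and returns them in the requested
      # format; supported formats are :hash (default), :bibtex, and :citeproc.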
      def parse(input, format = options[:format])
        formatter = "format_#{format}".to_sym
        send(formatter, label(input))
      rescue NoMethodError
        raise ArgumentError, "format not supported: #{formatter}"
      end

      # Returns an array of label/segment pairs for each line in the passed-in string.
      def label(input, labelled = false)
        string = input_to_s(input)

        model.label(prepare(string, labelled)).map! do |sequence|
          sequence.inject([]) do |ts, (token, label)|
            token, label = token[/^\S+/], label.to_sym
            if (prev = ts[-1]) && prev[0] == label
              prev[1] << ' ' << token
              ts
            else
              ts << [label, token]
            end
          end
        end
      end

      # Returns an array of tokens for each line of input.
      #
      # If the passed-in string is marked as being tagged, extracts labels
      # from the string and returns an array of token/label pairs for each
      # line of input.
      def tokenize(string, tagged = false)
        if tagged
          string.split(/[\n\r]+/).each_with_index.map do |s,i|
            tt, tokens, tags = s.split(options[:tagged_separator]), [], []

            tt.each do |token|
              case token
              when /^$/
                # skip
              when /^<([^\/>][^>]*)>$/
                tags << $1
              when /^<\/([^>]+)>$/
                unless (tag = tags.pop) == $1
                  raise ArgumentError, "mismatched tags on line #{i}: #{$1.inspect} (current tag was #{tag.inspect})"
                end
              else
                tokens << [token, (tags[-1] || :unknown).to_sym]
              end
            end

            tokens
          end
        else
          string.split(/[\n\r]+/).map { |s| s.split(options[:separator]) }
        end
      end

      # Prepares the passed-in string for processing by a CRF tagger. The
      # string is split into separate lines; each line is tokenized and
      # expanded. Returns an array of sequence arrays that can be labelled
      # by the CRF model.
      #
      # If the string is marked as being tagged by passing +true+ as the
      # second argument, training labels will be extracted from the string
      # and appended after feature expansion. The returned sequence arrays
      # can be used for training or testing the CRF model.
      def prepare(input, tagged = false)
        string = input_to_s(input)
        tokenize(string, tagged).map { |tk| tk.each_with_index.map { |(t,l),i| expand(t,tk,i,l) } }
      end

      # Expands the passed-in token string by appending a space separated list
      # of all features for the token.
      def expand(token, sequence = [], offset = 0, label = nil)
        f = features_for(token, strip(token), sequence, offset)
        f.unshift(token)
        f.push(label) unless label.nil?
        f.join(' ')
      end

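      # Trains the CRF model on tagged input; pass +true+ as the second
      # argument to start from a blank model instead of extending the
      # current one.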
      def train(input, truncate = false)
        string = input_to_s(input)
        @model = Wapiti::Model.new(:pattern => options[:pattern]) if truncate
        @model.train(prepare(string, true))
        @model.compact
        @model.path = Parser.models[options[:model]]
        @model
      end

      def test(input)
        string = input_to_s(input)
        model.options.check!
        model.label(prepare(string, true))
      end

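      # Applies the Normalizer to every label in the hash, then assigns a
      # BibTeX entry type via #classify.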
      def normalize(hash)
        hash.keys.each do |label|
          normalizer.send("normalize_#{label}", hash)
        end
        classify hash
      end

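      # Heuristically assigns a BibTeX entry type (:article, :book, etc.)
      # based on which fields are present; an existing :type is kept.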
      def classify(hash)
        return hash if hash.has_key?(:type)

        keys = hash.keys
        text = hash.values.flatten.join

        case
        when keys.include?(:journal)
          hash[:type] = :article
        when text =~ /proceedings/i
          hash[:type] = :inproceedings
        when keys.include?(:booktitle), keys.include?(:container)
          hash[:type] = :incollection
        when keys.include?(:publisher)
          hash[:type] = :book
        when keys.include?(:institution)
          hash[:type] = :techreport
        when keys.include?(:school)
          hash[:type] = :mastersthesis
        when text =~ /unpublished/i
          hash[:type] = :unpublished
        else
          hash[:type] = :misc
        end

        hash
      end

      private

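      # Converts the input to a string: short strings naming an existing
      # file are read from disk as UTF-8, arrays are joined by newlines,
      # and other strings pass through unchanged.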
      def input_to_s(input)
        case input
        when String
          if input.length < 128 && File.exists?(input)
            f = File.open(input, 'r:UTF-8')
            f.read
          else
            input
          end
        when Array
          input.join("\n")
        else
          raise ArgumentError, "invalid input: #{input.class}"
        end
      ensure
        f.close if f
      end

      def features_for(*arguments)
        Parser.features.map { |f| f.match(*arguments) }
      end

      def strip(token)
        token.gsub(options[:strip], '')
      end

      def format_bibtex(labels)
        b = BibTeX::Bibliography.new
        format_hash(labels).each do |hash|
          b << BibTeX::Entry.new(hash)
        end
        b
      end

      def format_hash(labels)
        labels.map do |line|
          hash = line.inject({}) do |h, (label, token)|
            if h.has_key?(label)
              h[label] = [h[label]].flatten << token
            else
              h[label] = token
            end
            h
          end
          normalize hash
        end
      end

      def format_citeproc(labels)
        format_bibtex(labels).to_citeproc
      end

    end

  end
end
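
A minimal usage sketch for the parser (assuming the gem and its wapiti and bibtex-ruby dependencies are installed, that `require 'anystyle/parser'` is the entry point, and with an invented reference string):

    require 'anystyle/parser'

    parser = Anystyle::Parser::Parser.instance
    ref = 'Derrida, J. (1967). De la grammatologie. Paris: Minuit.'

    parser.parse(ref)           # => array with one normalized hash (default :hash format)
    parser.parse(ref, :bibtex)  # => a BibTeX::Bibliography with one entry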