anystyle-parser 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -2
- data/HISTORY.md +6 -0
- data/anystyle-parser.gemspec +1 -1
- data/lib/anystyle/parser/features.rb +208 -208
- data/lib/anystyle/parser/normalizer.rb +359 -359
- data/lib/anystyle/parser/parser.rb +28 -10
- data/lib/anystyle/parser/support/anystyle.mod +32347 -5039
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/features_spec.rb +27 -21
- data/spec/anystyle/parser/normalizer_spec.rb +83 -62
- data/spec/anystyle/parser/parser_spec.rb +49 -6
- data/spec/fixtures/train_dps.txt +12 -0
- data/spec/spec_helper.rb +15 -3
- metadata +7 -5
@@ -1,363 +1,363 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
module Anystyle
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
4
|
+
module Parser
|
5
|
+
|
6
|
+
class Normalizer
|
7
|
+
|
8
|
+
include Singleton
|
9
|
+
|
10
|
+
# Memoizing lookup from a date token to a month number.
#
# On the first access for a key, the key is tested against the month patterns
# below in calendar order (the first match wins). The result is stored under
# that key and returned: 1..12 on a match, nil otherwise. The patterns are
# case-insensitive and unanchored, and also accept the spellings "mai" and
# "okt".
MONTH = Hash.new do |cache, token|
  patterns = [
    /jan/i, /feb/i, /mar/i, /apr/i, /ma[yi]/i, /jun/i,
    /jul/i, /aug/i, /sep/i, /o[ck]t/i, /nov/i, /dec/i
  ]

  # Regexp#=== (what case/when uses) is false for nil and other non-strings.
  position = patterns.index { |pattern| pattern === token }

  cache[token] = position.nil? ? nil : position + 1
end
|
40
|
+
|
41
|
+
# Routes calls of the form +normalize_<field>+ that have no dedicated
# implementation to the default +normalize+, passing the field name as a
# symbol followed by the original arguments. Every other unknown method is
# handed to +super+ (and therefore raises as usual).
def method_missing(name, *arguments, &block)
  case name.to_s
  when /^normalize_(.+)$/
    normalize($1.to_sym, *arguments, &block)
  else
    super
  end
end

# Keeps +respond_to?+ in agreement with +method_missing+: any
# +normalize_<field>+ name is reported as supported.
def respond_to_missing?(name, include_private = false)
  name.to_s =~ /^normalize_(.+)$/ ? true : super
end
|
49
|
+
|
50
|
+
# Default normalizer: trims non-alphanumeric characters from both ends of the
# value stored under +key+.
#
# Only the first token of hash[key] is kept; additional tokens are recorded
# through +unmatched+. The token string is modified in place. Any error
# (for instance a missing key) is reported with +warn+ and the hash is
# returned as it stands.
def normalize(key, hash)
  begin
    first, *rest = hash[key]
    unmatched(key, hash, rest) if rest.length > 0

    first.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
    hash[key] = first
  rescue => error
    warn error.message
  end

  hash
end
|
62
|
+
|
63
|
+
# Normalizes the :author field.
#
# Only the first :author token is used; further tokens are recorded through
# +unmatched+. If the token ends in an editor marker ("ed", "eds" or
# "editors", optionally followed by punctuation) and the hash has no :editor
# yet, the value is moved to :editor and handed to +normalize_editor+.
# Otherwise a trailing "et al." sets 'more-authors', surrounding
# non-alphanumeric characters are stripped and the result is passed through
# +normalize_names+.
#
# The token string is modified in place. Any error is reported with +warn+
# and the hash is returned as it stands.
def normalize_author(hash)
  authors, *dangling = hash[:author]
  unmatched(:author, hash, dangling) unless dangling.empty?

  # The look-behind requires the marker to start a word, so names that merely
  # end in "ed" (e.g. "Fred") are not mistaken for an editor marker.
  if authors =~ /(?<![[:alnum:]])[Ee]d(s|itors)?[^[:alnum:]]*$/ && !hash.has_key?(:editor)
    hash[:editor] = hash.delete(:author)
    hash = normalize_editor(hash)
  else
    # sub! returns nil when nothing was replaced.
    hash['more-authors'] = true if authors.sub!(/\bet\.?\s*al.*$/i, '')
    authors.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
    hash[:author] = normalize_names(authors)
  end

  hash
rescue => e
  warn e.message
  hash
end
|
81
|
+
|
82
|
+
# Normalizes the :editor field.
#
# The first :editor token is the editor string. Extra tokens are handled as
# follows:
# * if the hash has no :author, the first token becomes :author, the extra
#   tokens become :editor and both fields are normalized again;
# * else, if the first extra token contains digits, they are stored as an
#   integer in :edition;
# * else the extra tokens are recorded through +unmatched+.
#
# Afterwards a trailing "et al." sets 'more-editors', and surrounding
# punctuation, a leading "in", editor markers ("ed", "eds", "editor(s)",
# "edited", "Hrsg", "Herausgeber") and the word "by" are removed. If a
# "trans"/"translated" marker was removed as well, the resulting names are
# also stored in :translator. Names go through +normalize_names+.
#
# The token string is modified in place. Any error is reported with +warn+
# and the hash is returned as it stands.
def normalize_editor(hash)
  editors, *dangling = hash[:editor]

  unless dangling.empty?
    case
    when !hash.has_key?(:author)
      hash[:author] = editors
      hash[:editor] = dangling
      hash = normalize_author(hash)
      return normalize_editor(hash)
    when dangling[0] =~ /(\d+)/
      hash[:edition] = $1.to_i
    else
      unmatched(:editor, hash, dangling)
    end
  end

  # sub!/gsub! return nil when nothing was replaced.
  hash['more-editors'] = true if editors.sub!(/\bet\.?\s*al.*$/i, '')

  editors.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
  editors.gsub!(/^in\s+/i, '')
  # The look-around assertions require the marker to be a whole word, so
  # names containing "ed" (Edward, Edith, Fredrickson) are left intact.
  editors.gsub!(/[^[:alpha:]]*(?<![[:alpha:]])[Ee]d(s|itors?|ited)?(?![[:alpha:]])[^[:alpha:]]*/, '')
  editors.gsub!(/[^[:alpha:]]*([Hh]rsg|Herausgeber)[^[:alpha:]]*/, '')
  editors.gsub!(/\bby\b/i, '')

  is_trans = !!editors.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')

  hash[:editor] = normalize_names(editors)
  hash[:translator] = hash[:editor] if is_trans

  hash
rescue => e
  warn e.message
  hash
end
|
117
|
+
|
118
|
+
# Normalizes the :translator field.
#
# Strips surrounding non-alphanumeric characters, "trans"/"translated"
# markers and the word "by" from the value (in place), then stores the result
# of +normalize_names+ under :translator. Any error is reported with +warn+
# and the hash is returned as it stands.
def normalize_translator(hash)
  begin
    names = hash[:translator]

    [
      /^[^[:alnum:]]+|[^[:alnum:]]+$/,
      /[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i,
      /\bby\b/i
    ].each { |noise| names.gsub!(noise, '') }

    hash[:translator] = normalize_names(names)
  rescue => error
    warn error.message
  end

  hash
end
|
131
|
+
|
132
|
+
# Runs once while the class body is evaluated: switches on the
# :prefer_comma_as_separator option of Namae's shared parser instance.
# NOTE(review): this changes the parser instance globally, not only for this
# class; presumably it makes Namae treat commas as separators between
# names - confirm against the Namae documentation.
Namae::Parser.instance.options[:prefer_comma_as_separator] = true
|
133
|
+
|
134
|
+
# Parses a string of personal names with Namae and returns the names in sort
# order, joined by ' and '.
#
# If parsing fails, the error is reported with +warn+ and the input string is
# returned unchanged, so callers always get a string back.
def normalize_names(names)
  Namae.parse!(names).map(&:sort_order).join(' and ')
rescue => e
  warn e.message
  # Return the input, not `hash`: there is no local `hash` in this method, so
  # `hash` would be Object#hash, i.e. an Integer.
  names
end
|
140
|
+
|
141
|
+
# Normalizes the :title field.
#
# hash[:title] may hold a single token or a [title, container] pair; a second
# token is stored as :container and passed through the default +normalize+.
# Edition markers are pulled out by +extract_edition+, then leading
# whitespace, trailing punctuation/whitespace and one enclosing quote
# character on either side are removed (in place). Any error is reported with
# +warn+ and the hash is returned as it stands.
def normalize_title(hash)
  begin
    heading, source = hash[:title]

    if !source.nil?
      hash[:container] = source
      normalize(:container, hash)
    end

    extract_edition(heading, hash)

    heading.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
    heading.gsub!(/^["'”’´‘“`]|["'”’´‘“`]$/, '')

    hash[:title] = heading
  rescue => error
    warn error.message
  end

  hash
end
|
161
|
+
|
162
|
+
# Removes edition information from +token+ (in place) and collects it in
# hash[:edition].
#
# Recognised are a number followed by "ed", "ed.", "edition", "Auflage" or
# "Aufl." (optionally with an ordinal suffix such as "2nd"), and the trailing
# qualifiers "expanded", "illustrated", "revised" and "reprint" (optionally
# preceded by "and"). Values already present in hash[:edition] are kept;
# everything is joined with ', '. The hash is left untouched when nothing
# was found and no edition was present.
#
# Raises if +token+ is not a string; callers rescue this.
def extract_edition(token, hash)
  edition = [hash[:edition]].flatten.compact

  # The look-ahead after "ed" requires the marker to end there, so ordinary
  # words such as "2 educational" are not treated as an edition.
  if token.gsub!(/[^[:alnum:]]*(\d+)(?:st|nd|rd|th)?\s*(?:Aufl(?:age|\.)|ed(?:ition|\.)?(?![[:alpha:]]))[^[:alnum:]]*/i, '')
    edition << $1
  end

  # The look-behind requires each qualifier to start a word ("Unrevised" is
  # left alone). Order matters: each pattern is anchored at the end of what is
  # left after the previous removal.
  [
    /(?:\band)?[^[:alnum:]]*(?<![[:alpha:]])([Ee]xpanded)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*(?<![[:alpha:]])([Ii]llustrated)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*(?<![[:alpha:]])([Rr]evised)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*(?<![[:alpha:]])([Rr]eprint)[^[:alnum:]]*$/
  ].each do |qualifier|
    edition << $1 if token.gsub!(qualifier, '')
  end

  hash[:edition] = edition.join(', ') unless edition.empty?
end
|
187
|
+
|
188
|
+
# Normalizes the :booktitle field.
#
# Only the first token is used; further tokens are recorded through
# +unmatched+. A leading word "in" is dropped, edition markers are pulled out
# by +extract_edition+, and leading whitespace plus trailing
# punctuation/whitespace are removed. The token string is modified in place.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_booktitle(hash)
  booktitle, *dangling = hash[:booktitle]
  unmatched(:booktitle, hash, dangling) unless dangling.empty?

  # Only strip a stand-alone "in"; without the look-ahead, titles such as
  # "Introduction ..." would lose their first two letters.
  booktitle.gsub!(/^in(?![[:alnum:]])\s*/i, '')

  extract_edition(booktitle, hash)

  booktitle.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
  hash[:booktitle] = booktitle

  hash
rescue => e
  warn e.message
  hash
end
|
204
|
+
|
205
|
+
# Normalizes the :journal field.
#
# Keeps the first token (further tokens are recorded through +unmatched+) and
# removes leading whitespace and trailing punctuation/whitespace in place.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_journal(hash)
  begin
    name, *extras = hash[:journal]
    unmatched(:journal, hash, extras) if extras.length > 0

    name.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
    hash[:journal] = name
  rescue => error
    warn error.message
  end

  hash
end
|
217
|
+
|
218
|
+
# Normalizes the :container field.
#
# Keeps the first token (further tokens are recorded through +unmatched+).
# When the container mentions "dissertation abstracts", a trailing
# "section X: <name>" part is cut off (in place), the section name - if one
# was found - is stored in :category, and :type is set to :phdthesis.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_container(hash)
  begin
    source, *extras = hash[:container]
    unmatched(:container, hash, extras) if extras.length > 0

    # Regexp#=== (what case/when uses) is simply false for non-strings.
    if /dissertation abstracts/i === source
      source.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')
      section = Regexp.last_match(1)

      hash[:category] = section unless section.nil?
      hash[:type] = :phdthesis
    end

    hash[:container] = source
  rescue => error
    warn error.message
  end

  hash
end
|
235
|
+
|
236
|
+
# Normalizes the :date field.
#
# Keeps the first token (further tokens are recorded through +unmatched+).
# If MONTH yields a month number for the token it is stored in :month. If the
# token contains four consecutive digits they are stored as an integer in
# :year and the :date entry is removed; otherwise :date is left untouched.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_date(hash)
  begin
    text, *extras = hash[:date]
    unmatched(:date, hash, extras) if extras.length > 0

    month_number = MONTH[text]
    hash[:month] = month_number unless month_number.nil?

    if text =~ /(\d{4})/
      hash[:year] = Regexp.last_match(1).to_i
      hash.delete(:date)
    end
  rescue => error
    warn error.message
  end

  hash
end
|
254
|
+
|
255
|
+
# Normalizes the :volume field, splitting off issue numbers and pages.
#
# Keeps the first token (further tokens are recorded through +unmatched+).
# * "<vol>:<from>-<to>" with no :pages present: stores the integer volume
#   and the page range, then returns the result of +normalize_pages+.
# * Otherwise the first matching shape wins:
#   - volume followed by a number range: integer :volume, string :number;
#   - "[vol] no. <range>": optional integer :volume, string :number;
#   - "[vol] no. <n>": optional integer :volume, integer :number;
#   - two numbers: integer :volume and integer :number;
#   - a single number: integer :volume.
# A value that matches none of these is left as it is. Any error is reported
# with +warn+ and the hash is returned as it stands.
def normalize_volume(hash)
  begin
    text, *extras = hash[:volume]
    unmatched(:volume, hash, extras) if extras.length > 0

    if !hash.has_key?(:pages) && text =~ /\D*(\d+):(\d+(?:[—–-]+)\d+)/
      vol, range = Regexp.last_match.captures
      hash[:volume] = vol.to_i
      hash[:pages] = range
      return normalize_pages(hash)
    end

    # Regexp#=== mirrors case/when semantics (false for non-strings).
    if /\D*(\d+)\D+(\d+[\s&—–-]+\d+)/ === text
      vol, issue = Regexp.last_match.captures
      hash[:volume] = vol.to_i
      hash[:number] = issue
    elsif /(\d+)?\D+no\.\s*(\d+\D+\d+)/ === text
      vol, issue = Regexp.last_match.captures
      hash[:volume] = vol.to_i unless vol.nil?
      hash[:number] = issue
    elsif /(\d+)?\D+no\.\s*(\d+)/ === text
      vol, issue = Regexp.last_match.captures
      hash[:volume] = vol.to_i unless vol.nil?
      hash[:number] = issue.to_i
    elsif /\D*(\d+)\D+(\d+)/ === text
      vol, issue = Regexp.last_match.captures
      hash[:volume] = vol.to_i
      hash[:number] = issue.to_i
    elsif /(\d+)/ === text
      hash[:volume] = Regexp.last_match(1).to_i
    end
  rescue => error
    warn error.message
  end

  hash
end
|
284
|
+
|
285
|
+
# Normalizes the :pages field.
#
# Keeps the first token (further tokens are recorded through +unmatched+).
# A token of the form "volume.issue(year):pp" is split first: :volume,
# :number and :year (each only if present) are stored as integers and the
# remainder becomes :pages. Afterwards a :pages value containing two numbers
# is rewritten as "from–to" (joined with an en-dash) and one containing a
# single number is reduced to that number; both stay strings.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_pages(hash)
  begin
    text, *extras = hash[:pages]
    unmatched(:pages, hash, extras) if extras.length > 0

    # "volume.issue(year):pp"
    if /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x === text
      vol, issue, year, range = Regexp.last_match.captures

      hash[:volume] = vol.to_i
      hash[:number] = issue.to_i unless issue.nil?
      hash[:year] = year.to_i unless year.nil?
      hash[:pages] = range
    end

    current = hash[:pages]

    if /(\d+)\D+(\d+)/ === current
      hash[:pages] = Regexp.last_match.captures.join('–') # en-dash
    elsif /(\d+)/ === current
      hash[:pages] = Regexp.last_match(1)
    end
  rescue => error
    warn error.message
  end

  hash
end
|
310
|
+
|
311
|
+
# Normalizes the :location field.
#
# Keeps the first token; further tokens are recorded through +unmatched+
# under the :location label. Surrounding non-alphanumeric characters are
# stripped in place. If the hash has no :publisher and the location contains
# a colon, the text before the first colon becomes the location and
# everything after it becomes :publisher.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_location(hash)
  location, *dangling = hash[:location]
  unmatched(:location, hash, dangling) unless dangling.empty?

  location.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')

  if !hash.has_key?(:publisher) && location =~ /:/
    # Limit of 2: any further colons stay inside the publisher instead of
    # being silently dropped.
    location, publisher = location.split(/\s*:\s*/, 2)
    hash[:publisher] = publisher
  end

  hash[:location] = location
  hash
rescue => e
  warn e.message
  hash
end
|
328
|
+
|
329
|
+
# Normalizes the :isbn field.
#
# Keeps the first token (further tokens are recorded through +unmatched+) and
# reduces it to its first run of digits and hyphens, including a final
# "X"/"x" check digit when one directly follows the run. :isbn becomes nil
# when the token contains no such run.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_isbn(hash)
  isbn, *dangling = hash[:isbn]
  unmatched(:isbn, hash, dangling) unless dangling.empty?

  # ISBN-10 check digits may be "X"; the look-ahead makes sure the X is the
  # last character of the number and not the start of a following word.
  isbn = isbn[/[\d-]+(?:[Xx](?![[:alnum:]]))?/]
  hash[:isbn] = isbn

  hash
rescue => e
  warn e.message
  hash
end
|
341
|
+
|
342
|
+
# Normalizes the :url field.
#
# Keeps the first token (further tokens are recorded through +unmatched+),
# removes leading whitespace and trailing commas/whitespace in place and
# stores the result back under :url.
# Any error is reported with +warn+ and the hash is returned as it stands.
def normalize_url(hash)
  url, *dangling = hash[:url]
  unmatched(:url, hash, dangling) unless dangling.empty?

  url.gsub!(/^\s+|[,\s]+$/, '')
  # Store the cleaned url itself; the previous `hash[:isbn] = isbn` referenced
  # an undefined name and raised on every call.
  hash[:url] = url
  hash
rescue => e
  warn e.message
  hash
end
|
353
|
+
|
354
|
+
private
|
355
|
+
|
356
|
+
# Records tokens that could not be assigned to +label+: they are joined with
# single spaces and stored in +hash+ under the string key
# "unmatched-<label>", replacing any previous entry. Returns the joined
# string.
def unmatched(label, hash, tokens)
  key = "unmatched-#{label}"
  hash[key] = tokens.join(' ')
end
|
359
|
+
|
360
|
+
end
|
361
|
+
|
362
|
+
end
|
363
363
|
end
|