anystyle-parser 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,363 +1,363 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module Anystyle
4
- module Parser
5
-
6
- class Normalizer
7
-
8
- include Singleton
9
-
10
- MONTH = Hash.new do |h,k|
11
- case k
12
- when /jan/i
13
- h[k] = 1
14
- when /feb/i
15
- h[k] = 2
16
- when /mar/i
17
- h[k] = 3
18
- when /apr/i
19
- h[k] = 4
20
- when /ma[yi]/i
21
- h[k] = 5
22
- when /jun/i
23
- h[k] = 6
24
- when /jul/i
25
- h[k] = 7
26
- when /aug/i
27
- h[k] = 8
28
- when /sep/i
29
- h[k] = 9
30
- when /o[ck]t/i
31
- h[k] = 10
32
- when /nov/i
33
- h[k] = 11
34
- when /dec/i
35
- h[k] = 12
36
- else
37
- h[k] = nil
38
- end
39
- end
40
-
41
- def method_missing(name, *arguments, &block)
42
- case name.to_s
43
- when /^normalize_(.+)$/
44
- normalize($1.to_sym, *arguments, &block)
45
- else
46
- super
47
- end
48
- end
49
-
50
- # Default normalizer. Strips punctuation.
51
- def normalize(key, hash)
52
- token, *dangling = hash[key]
53
- unmatched(key, hash, dangling) unless dangling.empty?
54
-
55
- token.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
56
- hash[key] = token
57
- hash
58
- rescue => e
59
- warn e.message
60
- hash
61
- end
62
-
63
- def normalize_author(hash)
64
- authors, *dangling = hash[:author]
65
- unmatched(:author, hash, dangling) unless dangling.empty?
66
-
67
- if authors =~ /[^[:alnum:]]*[Ee]d(s|itors)?[^[:alnum:]]*$/ && !hash.has_key?(:editor)
68
- hash[:editor] = hash.delete(:author)
69
- hash = normalize_editor(hash)
70
- else
71
- hash['more-authors'] = true if !!authors.sub!(/\bet\.?\s*al.*$/i, '')
72
- authors.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
73
- hash[:author] = normalize_names(authors)
74
- end
75
-
76
- hash
77
- rescue => e
78
- warn e.message
79
- hash
80
- end
81
-
82
- def normalize_editor(hash)
83
- editors, *dangling = hash[:editor]
84
-
85
- unless dangling.empty?
86
- case
87
- when !hash.has_key?(:author)
88
- hash[:author] = editors
89
- hash[:editor] = dangling
90
- hash = normalize_author(hash)
91
- return normalize_editor(hash)
92
- when dangling[0] =~ /(\d+)/
93
- hash[:edition] = $1.to_i
94
- else
95
- unmatched(:editor, hash, dangling)
96
- end
97
- end
98
-
99
- hash['more-editors'] = true if !!editors.sub!(/\bet\.?\s*al.*$/i, '')
100
-
101
- editors.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
102
- editors.gsub!(/^in\s+/i, '')
103
- editors.gsub!(/[^[:alpha:]]*[Ee]d(s|itors?|ited)?[^[:alpha:]]*/, '')
104
- editors.gsub!(/[^[:alpha:]]*([Hh]rsg|Herausgeber)[^[:alpha:]]*/, '')
105
- editors.gsub!(/\bby\b/i, '')
106
-
107
- is_trans = !!editors.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')
108
-
109
- hash[:editor] = normalize_names(editors)
110
- hash[:translator] = hash[:editor] if is_trans
111
-
112
- hash
113
- rescue => e
114
- warn e.message
115
- hash
116
- end
117
-
118
- def normalize_translator(hash)
119
- translators = hash[:translator]
120
-
121
- translators.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
122
- translators.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')
123
- translators.gsub!(/\bby\b/i, '')
124
-
125
- hash[:translator] = normalize_names(translators)
126
- hash
127
- rescue => e
128
- warn e.message
129
- hash
130
- end
131
-
132
- Namae::Parser.instance.options[:prefer_comma_as_separator] = true
133
-
134
- def normalize_names(names)
135
- Namae.parse!(names).map(&:sort_order).join(' and ')
136
- rescue => e
137
- warn e.message
138
- hash
139
- end
140
-
141
- def normalize_title(hash)
142
- title, container = hash[:title]
143
-
144
- unless container.nil?
145
- hash[:container] = container
146
- normalize(:container, hash)
147
- end
148
-
149
- extract_edition(title, hash)
150
-
151
- title.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
152
- title.gsub!(/^["'”’´‘“`]|["'”’´‘“`]$/, '')
153
-
154
- hash[:title] = title
155
-
156
- hash
157
- rescue => e
158
- warn e.message
159
- hash
160
- end
161
-
162
- def extract_edition(token, hash)
163
- edition = [hash[:edition]].flatten.compact
164
-
165
- if token.gsub!(/[^[:alnum:]]*(\d+)(?:st|nd|rd|th)?\s*(?:Aufl(?:age|\.)|ed(?:ition|\.)?)[^[:alnum:]]*/i, '')
166
- edition << $1
167
- end
168
-
169
- if token.gsub!(/(?:\band)?[^[:alnum:]]*([Ee]xpanded)[^[:alnum:]]*$/, '')
170
- edition << $1
171
- end
172
-
173
- if token.gsub!(/(?:\band)?[^[:alnum:]]*([Ii]llustrated)[^[:alnum:]]*$/, '')
174
- edition << $1
175
- end
176
-
177
- if token.gsub!(/(?:\band)?[^[:alnum:]]*([Rr]evised)[^[:alnum:]]*$/, '')
178
- edition << $1
179
- end
180
-
181
- if token.gsub!(/(?:\band)?[^[:alnum:]]*([Rr]eprint)[^[:alnum:]]*$/, '')
182
- edition << $1
183
- end
184
-
185
- hash[:edition] = edition.join(', ') unless edition.empty?
186
- end
187
-
188
- def normalize_booktitle(hash)
189
- booktitle, *dangling = hash[:booktitle]
190
- unmatched(:booktitle, hash, dangling) unless dangling.empty?
191
-
192
- booktitle.gsub!(/^in\s*/i, '')
193
-
194
- extract_edition(booktitle, hash)
195
-
196
- booktitle.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
197
- hash[:booktitle] = booktitle
198
-
199
- hash
200
- rescue => e
201
- warn e.message
202
- hash
203
- end
204
-
205
- def normalize_journal(hash)
206
- journal, *dangling = hash[:journal]
207
- unmatched(:journal, hash, dangling) unless dangling.empty?
208
-
209
- journal.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
210
- hash[:journal] = journal
211
-
212
- hash
213
- rescue => e
214
- warn e.message
215
- hash
216
- end
217
-
218
- def normalize_container(hash)
219
- container, *dangling = hash[:container]
220
- unmatched(:container, hash, dangling) unless dangling.empty?
221
-
222
- case container
223
- when /dissertation abstracts/i
224
- container.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')
225
- hash[:category] = $1 unless $1.nil?
226
- hash[:type] = :phdthesis
227
- end
228
-
229
- hash[:container] = container
230
- hash
231
- rescue => e
232
- warn e.message
233
- hash
234
- end
235
-
236
- def normalize_date(hash)
237
- date, *dangling = hash[:date]
238
- unmatched(:date, hash, dangling) unless dangling.empty?
239
-
240
- unless (month = MONTH[date]).nil?
241
- hash[:month] = month
242
- end
243
-
244
- if date =~ /(\d{4})/
245
- hash[:year] = $1.to_i
246
- hash.delete(:date)
247
- end
248
-
249
- hash
250
- rescue => e
251
- warn e.message
252
- hash
253
- end
254
-
255
- def normalize_volume(hash)
256
- volume, *dangling = hash[:volume]
257
- unmatched(:volume, hash, dangling) unless dangling.empty?
258
-
259
- if !hash.has_key?(:pages) && volume =~ /\D*(\d+):(\d+(?:[–-]+)\d+)/
260
- hash[:volume], hash[:pages] = $1.to_i, $2
261
- hash = normalize_pages(hash)
262
- else
263
- case volume
264
- when /\D*(\d+)\D+(\d+[\s&-]+\d+)/
265
- hash[:volume], hash[:number] = $1.to_i, $2
266
- when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
267
- hash[:volume] = $1.to_i unless $1.nil?
268
- hash[:number] = $2
269
- when /(\d+)?\D+no\.\s*(\d+)/
270
- hash[:volume] = $1.to_i unless $1.nil?
271
- hash[:number] = $2.to_i
272
- when /\D*(\d+)\D+(\d+)/
273
- hash[:volume], hash[:number] = $1.to_i, $2.to_i
274
- when /(\d+)/
275
- hash[:volume] = $1.to_i
276
- end
277
- end
278
-
279
- hash
280
- rescue => e
281
- warn e.message
282
- hash
283
- end
284
-
285
- def normalize_pages(hash)
286
- pages, *dangling = hash[:pages]
287
- unmatched(:pages, hash, dangling) unless dangling.empty?
288
-
289
- # "volume.issue(year):pp"
290
- case pages
291
- when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
292
- hash[:volume] = $1.to_i
293
- hash[:number] = $2.to_i unless $2.nil?
294
- hash[:year] = $3.to_i unless $3.nil?
295
- hash[:pages] = $4
296
- end
297
-
298
- case hash[:pages]
299
- when /(\d+)\D+(\d+)/
300
- hash[:pages] = [$1,$2].join('--')
301
- when /(\d+)/
302
- hash[:pages] = $1
303
- end
304
-
305
- hash
306
- rescue => e
307
- warn e.message
308
- hash
309
- end
310
-
311
- def normalize_location(hash)
312
- location, *dangling = hash[:location]
313
- unmatched(:pages, hash, dangling) unless dangling.empty?
314
-
315
- location.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
316
-
317
- if !hash.has_key?(:publisher) && location =~ /:/
318
- location, publisher = location.split(/\s*:\s*/)
319
- hash[:publisher] = publisher
320
- end
321
-
322
- hash[:location] = location
323
- hash
324
- rescue => e
325
- warn e.message
326
- hash
327
- end
328
-
329
- def normalize_isbn(hash)
330
- isbn, *dangling = hash[:isbn]
331
- unmatched(:isbn, hash, dangling) unless dangling.empty?
332
-
333
- isbn = isbn[/[\d-]+/]
334
- hash[:isbn] = isbn
335
-
336
- hash
337
- rescue => e
338
- warn e.message
339
- hash
340
- end
341
-
342
- def normalize_url(hash)
343
- url, *dangling = hash[:url]
344
- unmatched(:url, hash, dangling) unless dangling.empty?
345
-
346
- url.gsub!(/^\s+|[,\s]+$/, '')
347
- hash[:isbn] = isbn
348
- hash
349
- rescue => e
350
- warn e.message
351
- hash
352
- end
353
-
354
- private
355
-
356
- def unmatched(label, hash, tokens)
357
- hash["unmatched-#{label}"] = tokens.join(' ')
358
- end
359
-
360
- end
361
-
362
- end
4
+ module Parser
5
+
6
+ class Normalizer
7
+
8
+ include Singleton
9
+
10
+ MONTH = Hash.new do |h,k|
11
+ case k
12
+ when /jan/i
13
+ h[k] = 1
14
+ when /feb/i
15
+ h[k] = 2
16
+ when /mar/i
17
+ h[k] = 3
18
+ when /apr/i
19
+ h[k] = 4
20
+ when /ma[yi]/i
21
+ h[k] = 5
22
+ when /jun/i
23
+ h[k] = 6
24
+ when /jul/i
25
+ h[k] = 7
26
+ when /aug/i
27
+ h[k] = 8
28
+ when /sep/i
29
+ h[k] = 9
30
+ when /o[ck]t/i
31
+ h[k] = 10
32
+ when /nov/i
33
+ h[k] = 11
34
+ when /dec/i
35
+ h[k] = 12
36
+ else
37
+ h[k] = nil
38
+ end
39
+ end
40
+
41
+ def method_missing(name, *arguments, &block)
42
+ case name.to_s
43
+ when /^normalize_(.+)$/
44
+ normalize($1.to_sym, *arguments, &block)
45
+ else
46
+ super
47
+ end
48
+ end
49
+
50
+ # Default normalizer. Strips punctuation.
51
+ def normalize(key, hash)
52
+ token, *dangling = hash[key]
53
+ unmatched(key, hash, dangling) unless dangling.empty?
54
+
55
+ token.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
56
+ hash[key] = token
57
+ hash
58
+ rescue => e
59
+ warn e.message
60
+ hash
61
+ end
62
+
63
+ def normalize_author(hash)
64
+ authors, *dangling = hash[:author]
65
+ unmatched(:author, hash, dangling) unless dangling.empty?
66
+
67
+ if authors =~ /[^[:alnum:]]*[Ee]d(s|itors)?[^[:alnum:]]*$/ && !hash.has_key?(:editor)
68
+ hash[:editor] = hash.delete(:author)
69
+ hash = normalize_editor(hash)
70
+ else
71
+ hash['more-authors'] = true if !!authors.sub!(/\bet\.?\s*al.*$/i, '')
72
+ authors.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
73
+ hash[:author] = normalize_names(authors)
74
+ end
75
+
76
+ hash
77
+ rescue => e
78
+ warn e.message
79
+ hash
80
+ end
81
+
82
+ def normalize_editor(hash)
83
+ editors, *dangling = hash[:editor]
84
+
85
+ unless dangling.empty?
86
+ case
87
+ when !hash.has_key?(:author)
88
+ hash[:author] = editors
89
+ hash[:editor] = dangling
90
+ hash = normalize_author(hash)
91
+ return normalize_editor(hash)
92
+ when dangling[0] =~ /(\d+)/
93
+ hash[:edition] = $1.to_i
94
+ else
95
+ unmatched(:editor, hash, dangling)
96
+ end
97
+ end
98
+
99
+ hash['more-editors'] = true if !!editors.sub!(/\bet\.?\s*al.*$/i, '')
100
+
101
+ editors.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
102
+ editors.gsub!(/^in\s+/i, '')
103
+ editors.gsub!(/[^[:alpha:]]*[Ee]d(s|itors?|ited)?[^[:alpha:]]*/, '')
104
+ editors.gsub!(/[^[:alpha:]]*([Hh]rsg|Herausgeber)[^[:alpha:]]*/, '')
105
+ editors.gsub!(/\bby\b/i, '')
106
+
107
+ is_trans = !!editors.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')
108
+
109
+ hash[:editor] = normalize_names(editors)
110
+ hash[:translator] = hash[:editor] if is_trans
111
+
112
+ hash
113
+ rescue => e
114
+ warn e.message
115
+ hash
116
+ end
117
+
118
+ def normalize_translator(hash)
119
+ translators = hash[:translator]
120
+
121
+ translators.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
122
+ translators.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')
123
+ translators.gsub!(/\bby\b/i, '')
124
+
125
+ hash[:translator] = normalize_names(translators)
126
+ hash
127
+ rescue => e
128
+ warn e.message
129
+ hash
130
+ end
131
+
132
+ Namae::Parser.instance.options[:prefer_comma_as_separator] = true
133
+
134
+ def normalize_names(names)
135
+ Namae.parse!(names).map(&:sort_order).join(' and ')
136
+ rescue => e
137
+ warn e.message
138
+ hash
139
+ end
140
+
141
+ def normalize_title(hash)
142
+ title, container = hash[:title]
143
+
144
+ unless container.nil?
145
+ hash[:container] = container
146
+ normalize(:container, hash)
147
+ end
148
+
149
+ extract_edition(title, hash)
150
+
151
+ title.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
152
+ title.gsub!(/^["'”’´‘“`]|["'”’´‘“`]$/, '')
153
+
154
+ hash[:title] = title
155
+
156
+ hash
157
+ rescue => e
158
+ warn e.message
159
+ hash
160
+ end
161
+
162
+ def extract_edition(token, hash)
163
+ edition = [hash[:edition]].flatten.compact
164
+
165
+ if token.gsub!(/[^[:alnum:]]*(\d+)(?:st|nd|rd|th)?\s*(?:Aufl(?:age|\.)|ed(?:ition|\.)?)[^[:alnum:]]*/i, '')
166
+ edition << $1
167
+ end
168
+
169
+ if token.gsub!(/(?:\band)?[^[:alnum:]]*([Ee]xpanded)[^[:alnum:]]*$/, '')
170
+ edition << $1
171
+ end
172
+
173
+ if token.gsub!(/(?:\band)?[^[:alnum:]]*([Ii]llustrated)[^[:alnum:]]*$/, '')
174
+ edition << $1
175
+ end
176
+
177
+ if token.gsub!(/(?:\band)?[^[:alnum:]]*([Rr]evised)[^[:alnum:]]*$/, '')
178
+ edition << $1
179
+ end
180
+
181
+ if token.gsub!(/(?:\band)?[^[:alnum:]]*([Rr]eprint)[^[:alnum:]]*$/, '')
182
+ edition << $1
183
+ end
184
+
185
+ hash[:edition] = edition.join(', ') unless edition.empty?
186
+ end
187
+
188
+ def normalize_booktitle(hash)
189
+ booktitle, *dangling = hash[:booktitle]
190
+ unmatched(:booktitle, hash, dangling) unless dangling.empty?
191
+
192
+ booktitle.gsub!(/^in\s*/i, '')
193
+
194
+ extract_edition(booktitle, hash)
195
+
196
+ booktitle.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
197
+ hash[:booktitle] = booktitle
198
+
199
+ hash
200
+ rescue => e
201
+ warn e.message
202
+ hash
203
+ end
204
+
205
+ def normalize_journal(hash)
206
+ journal, *dangling = hash[:journal]
207
+ unmatched(:journal, hash, dangling) unless dangling.empty?
208
+
209
+ journal.gsub!(/^[\s]+|[\.,:;\s]+$/, '')
210
+ hash[:journal] = journal
211
+
212
+ hash
213
+ rescue => e
214
+ warn e.message
215
+ hash
216
+ end
217
+
218
+ def normalize_container(hash)
219
+ container, *dangling = hash[:container]
220
+ unmatched(:container, hash, dangling) unless dangling.empty?
221
+
222
+ case container
223
+ when /dissertation abstracts/i
224
+ container.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')
225
+ hash[:category] = $1 unless $1.nil?
226
+ hash[:type] = :phdthesis
227
+ end
228
+
229
+ hash[:container] = container
230
+ hash
231
+ rescue => e
232
+ warn e.message
233
+ hash
234
+ end
235
+
236
+ def normalize_date(hash)
237
+ date, *dangling = hash[:date]
238
+ unmatched(:date, hash, dangling) unless dangling.empty?
239
+
240
+ unless (month = MONTH[date]).nil?
241
+ hash[:month] = month
242
+ end
243
+
244
+ if date =~ /(\d{4})/
245
+ hash[:year] = $1.to_i
246
+ hash.delete(:date)
247
+ end
248
+
249
+ hash
250
+ rescue => e
251
+ warn e.message
252
+ hash
253
+ end
254
+
255
+ def normalize_volume(hash)
256
+ volume, *dangling = hash[:volume]
257
+ unmatched(:volume, hash, dangling) unless dangling.empty?
258
+
259
+ if !hash.has_key?(:pages) && volume =~ /\D*(\d+):(\d+(?:[—–-]+)\d+)/
260
+ hash[:volume], hash[:pages] = $1.to_i, $2
261
+ hash = normalize_pages(hash)
262
+ else
263
+ case volume
264
+ when /\D*(\d+)\D+(\d+[\s&—–-]+\d+)/
265
+ hash[:volume], hash[:number] = $1.to_i, $2
266
+ when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
267
+ hash[:volume] = $1.to_i unless $1.nil?
268
+ hash[:number] = $2
269
+ when /(\d+)?\D+no\.\s*(\d+)/
270
+ hash[:volume] = $1.to_i unless $1.nil?
271
+ hash[:number] = $2.to_i
272
+ when /\D*(\d+)\D+(\d+)/
273
+ hash[:volume], hash[:number] = $1.to_i, $2.to_i
274
+ when /(\d+)/
275
+ hash[:volume] = $1.to_i
276
+ end
277
+ end
278
+
279
+ hash
280
+ rescue => e
281
+ warn e.message
282
+ hash
283
+ end
284
+
285
+ def normalize_pages(hash)
286
+ pages, *dangling = hash[:pages]
287
+ unmatched(:pages, hash, dangling) unless dangling.empty?
288
+
289
+ # "volume.issue(year):pp"
290
+ case pages
291
+ when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
292
+ hash[:volume] = $1.to_i
293
+ hash[:number] = $2.to_i unless $2.nil?
294
+ hash[:year] = $3.to_i unless $3.nil?
295
+ hash[:pages] = $4
296
+ end
297
+
298
+ case hash[:pages]
299
+ when /(\d+)\D+(\d+)/
300
+ hash[:pages] = [$1,$2].join('') # en-dash
301
+ when /(\d+)/
302
+ hash[:pages] = $1
303
+ end
304
+
305
+ hash
306
+ rescue => e
307
+ warn e.message
308
+ hash
309
+ end
310
+
311
+ def normalize_location(hash)
312
+ location, *dangling = hash[:location]
313
+ unmatched(:pages, hash, dangling) unless dangling.empty?
314
+
315
+ location.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
316
+
317
+ if !hash.has_key?(:publisher) && location =~ /:/
318
+ location, publisher = location.split(/\s*:\s*/)
319
+ hash[:publisher] = publisher
320
+ end
321
+
322
+ hash[:location] = location
323
+ hash
324
+ rescue => e
325
+ warn e.message
326
+ hash
327
+ end
328
+
329
+ def normalize_isbn(hash)
330
+ isbn, *dangling = hash[:isbn]
331
+ unmatched(:isbn, hash, dangling) unless dangling.empty?
332
+
333
+ isbn = isbn[/[\d-]+/]
334
+ hash[:isbn] = isbn
335
+
336
+ hash
337
+ rescue => e
338
+ warn e.message
339
+ hash
340
+ end
341
+
342
+ def normalize_url(hash)
343
+ url, *dangling = hash[:url]
344
+ unmatched(:url, hash, dangling) unless dangling.empty?
345
+
346
+ url.gsub!(/^\s+|[,\s]+$/, '')
347
+ hash[:isbn] = isbn
348
+ hash
349
+ rescue => e
350
+ warn e.message
351
+ hash
352
+ end
353
+
354
+ private
355
+
356
+ def unmatched(label, hash, tokens)
357
+ hash["unmatched-#{label}"] = tokens.join(' ')
358
+ end
359
+
360
+ end
361
+
362
+ end
363
363
  end