anystyle-parser 0.6.9 → 0.6.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/README.md +3 -1
- data/anystyle-parser.gemspec +1 -1
- data/lib/anystyle/parser/normalizer.rb +21 -14
- data/lib/anystyle/parser/parser.rb +18 -6
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/dictionary_spec.rb +6 -6
- data/spec/anystyle/parser/features_spec.rb +3 -3
- data/spec/anystyle/parser/normalizer_spec.rb +58 -48
- data/spec/anystyle/parser/parser_spec.rb +40 -32
- metadata +4 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 978c32592fd4e4edade7f5648a1b3095d652a68b
|
4
|
+
data.tar.gz: e310a276de3bb0e4c94ca4df72ed8dda378a9fd6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8ef818cc5486835a51de69f4cc30d46975e162c0b0b8a7cd2d08ab3e94a3956e7ffecaa1f6e4755a29b186d3f9df56b89bd6e554e8defc64f8da7e2a1fc50c8f
|
7
|
+
data.tar.gz: ae731f1ace948eafa16fe20bf921b48debe52f4656368f4da3ae16b2b8a6612bd2d7b09b728209558b9a27fd2f734e65752012fd714d6521983d98ff0ba96245
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -17,7 +17,9 @@ CSL/CiteProc JSON format).
|
|
17
17
|
Web Application and Web Service
|
18
18
|
-------------------------------
|
19
19
|
Anystyle-Parser is available as a web application and a web service at
|
20
|
-
[http://anystyle.io](http://anystyle.io).
|
20
|
+
[http://anystyle.io](http://anystyle.io). For example Ruby code using
|
21
|
+
the anystyle.io API, see this [prototype](https://gist.github.com/inukshuk/f1d47aeab1f778bca8ce)
|
22
|
+
for a style predictor.
|
21
23
|
|
22
24
|
Installation
|
23
25
|
------------
|
data/anystyle-parser.gemspec
CHANGED
@@ -20,7 +20,7 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.add_runtime_dependency('bibtex-ruby', '~>3.0')
|
21
21
|
s.add_runtime_dependency('builder', '>=3.0', '<4.0')
|
22
22
|
s.add_runtime_dependency('wapiti', '~>0.1')
|
23
|
-
s.add_runtime_dependency('namae', '~>0.
|
23
|
+
s.add_runtime_dependency('namae', '~>0.9')
|
24
24
|
|
25
25
|
s.files = `git ls-files`.split("\n").reject { |path|
|
26
26
|
path.start_with?('.')
|
@@ -58,6 +58,16 @@ module Anystyle
|
|
58
58
|
hash
|
59
59
|
end
|
60
60
|
|
61
|
+
def normalize_accessed(hash)
|
62
|
+
token, *dangling = hash[:accessed]
|
63
|
+
unmatched(:accessed, hash, dangling) unless dangling.empty?
|
64
|
+
|
65
|
+
token.gsub!(/(accessed|retrieved):?\s*/i, '')
|
66
|
+
|
67
|
+
hash[:accessed] = token
|
68
|
+
hash
|
69
|
+
end
|
70
|
+
|
61
71
|
def normalize_key(hash)
|
62
72
|
token, *dangling = hash[:key]
|
63
73
|
unmatched(:key, hash, dangling) unless dangling.empty?
|
@@ -114,8 +124,8 @@ module Anystyle
|
|
114
124
|
|
115
125
|
editors.gsub!(/^\W+|\W+$/, '')
|
116
126
|
editors.gsub!(/^in:?\s+/i, '')
|
117
|
-
editors.gsub!(
|
118
|
-
editors.gsub!(
|
127
|
+
editors.gsub!(/\W*\b[Ee]d(s|itors?|ited)?\b\W*/, '')
|
128
|
+
editors.gsub!(/\W*\b([Hh]rsg|gg?|Herausgeber)\b\W*/, '')
|
119
129
|
editors.gsub!(/\b[Hh]erausgegeben von\b/, '')
|
120
130
|
editors.gsub!(/\bby\b/i, '')
|
121
131
|
|
@@ -134,8 +144,9 @@ module Anystyle
|
|
134
144
|
def normalize_translator(hash)
|
135
145
|
translators = hash[:translator]
|
136
146
|
|
147
|
+
translators.gsub!(/\b([Ii]n (d|ein)er )?[Üü]ber(s\.|setzt|setzung|tragen|tragung) v(\.|on\b)/, '')
|
137
148
|
translators.gsub!(/^\W+|\W+$/, '')
|
138
|
-
translators.gsub!(/[^[:alpha:]]
|
149
|
+
translators.gsub!(/[^[:alpha:]]*\btrans(l(ated)?)?\b[^[:alpha:]]*/i, '')
|
139
150
|
translators.gsub!(/\bby\b/i, '')
|
140
151
|
|
141
152
|
hash[:translator] = normalize_names(translators)
|
@@ -170,10 +181,7 @@ module Anystyle
|
|
170
181
|
names.gsub!(/;|:/, ',')
|
171
182
|
|
172
183
|
Namae.parse!(names).map { |name|
|
173
|
-
|
174
|
-
name.given.gsub!(/\b([[:upper:]])(\s|$)/, '\1.\2')
|
175
|
-
end
|
176
|
-
|
184
|
+
name.normalize_initials
|
177
185
|
name.sort_order
|
178
186
|
|
179
187
|
}.join(' and ')
|
@@ -270,21 +278,20 @@ module Anystyle
|
|
270
278
|
end
|
271
279
|
|
272
280
|
def normalize_date(hash)
|
273
|
-
date
|
274
|
-
unmatched(:date, hash, dangling) unless dangling.empty?
|
281
|
+
date = Array(hash[:date]).join(' ')
|
275
282
|
|
276
283
|
unless (month = MONTH[date]).nil?
|
277
|
-
|
284
|
+
month = '%02d' % month
|
278
285
|
end
|
279
286
|
|
280
287
|
if date =~ /(\d{4})/
|
281
|
-
|
288
|
+
year = $1
|
282
289
|
|
283
|
-
if
|
284
|
-
|
290
|
+
if month && date =~ /\b(\d{1,2})\b/
|
291
|
+
day = '%02d' % $1.to_i
|
285
292
|
end
|
286
293
|
|
287
|
-
hash.
|
294
|
+
hash[:date] = [year, month, day].compact.join('-')
|
288
295
|
end
|
289
296
|
|
290
297
|
hash
|
@@ -3,17 +3,17 @@ module Anystyle
|
|
3
3
|
|
4
4
|
class Parser
|
5
5
|
|
6
|
-
@formats = [:bibtex, :hash, :citeproc, :xml, :tags, :raw].freeze
|
6
|
+
@formats = [:bibtex, :hash, :normalized, :citeproc, :xml, :tags, :raw].freeze
|
7
7
|
|
8
8
|
@defaults = {
|
9
9
|
:model => File.expand_path('../support/anystyle.mod', __FILE__),
|
10
10
|
:pattern => File.expand_path('../support/anystyle.pat', __FILE__),
|
11
11
|
:compact => true,
|
12
12
|
:threads => 4,
|
13
|
-
:separator => /\s+|\b(\d
|
13
|
+
:separator => /\s+|\b(\d\S*:)/,
|
14
14
|
:tagged_separator => /\s+|(<\/?[^>]+>)/,
|
15
15
|
:strip => /[^[:alnum:]]/,
|
16
|
-
:format => :
|
16
|
+
:format => :normalized,
|
17
17
|
:xml_entities => Hash[*%w{ & & < < > > ' ' " " }],
|
18
18
|
:training_data => File.expand_path('../../../../resources/train.txt', __FILE__)
|
19
19
|
}.freeze
|
@@ -258,8 +258,17 @@ module Anystyle
|
|
258
258
|
|
259
259
|
def format_bibtex(labels)
|
260
260
|
b = BibTeX::Bibliography.new
|
261
|
-
|
261
|
+
format_normalized(labels).each do |hash|
|
262
262
|
hash[:address] = hash.delete :location if hash.key?(:location)
|
263
|
+
hash[:urldate] = hash.delete :accessed if hash.key?(:accessed)
|
264
|
+
|
265
|
+
if hash.key?(:authority)
|
266
|
+
if [:techreport,:thesis].include?(hash[:type])
|
267
|
+
hash[:institution] = hash.delete :authority
|
268
|
+
else
|
269
|
+
hash[:organization] = hash.delete :authority
|
270
|
+
end
|
271
|
+
end
|
263
272
|
|
264
273
|
b << BibTeX::Entry.new(hash)
|
265
274
|
end
|
@@ -276,7 +285,7 @@ module Anystyle
|
|
276
285
|
|
277
286
|
def format_hash(labels)
|
278
287
|
labels.map do |line|
|
279
|
-
|
288
|
+
line.inject({}) do |h, (label, token)|
|
280
289
|
if h.has_key?(label)
|
281
290
|
h[label] = [h[label]].flatten << token
|
282
291
|
else
|
@@ -284,10 +293,13 @@ module Anystyle
|
|
284
293
|
end
|
285
294
|
h
|
286
295
|
end
|
287
|
-
normalize hash
|
288
296
|
end
|
289
297
|
end
|
290
298
|
|
299
|
+
def format_normalized(labels)
|
300
|
+
format_hash(labels).map { |h| normalize h }
|
301
|
+
end
|
302
|
+
|
291
303
|
def format_citeproc(labels)
|
292
304
|
format_bibtex(labels).to_citeproc
|
293
305
|
end
|
@@ -7,16 +7,16 @@ module Anystyle
|
|
7
7
|
|
8
8
|
let(:dict) { Dictionary.instance }
|
9
9
|
|
10
|
-
it { Dictionary.
|
11
|
-
it { dict.
|
10
|
+
it { expect(Dictionary).not_to respond_to(:new) }
|
11
|
+
it { expect(dict).not_to be nil }
|
12
12
|
|
13
13
|
describe '.modes' do
|
14
14
|
it 'returns an array' do
|
15
|
-
Dictionary.modes.
|
15
|
+
expect(Dictionary.modes).to be_a(Array)
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'contains at least :hash' do
|
19
|
-
Dictionary.modes.
|
19
|
+
expect(Dictionary.modes).to include(:hash)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
@@ -24,12 +24,12 @@ module Anystyle
|
|
24
24
|
|
25
25
|
%w{ philippines italy }.each do |place|
|
26
26
|
it "#{place.inspect} should be a place name" do
|
27
|
-
dict[place].
|
27
|
+
expect(dict[place]).to eq(Dictionary.code[:place])
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
31
|
it "accepts unicode strins like 'çela' (surname)" do
|
32
|
-
(dict['çela'] & Dictionary.code[:surname]).
|
32
|
+
expect(dict['çela'] & Dictionary.code[:surname]).to be > 0
|
33
33
|
end
|
34
34
|
|
35
35
|
end
|
@@ -8,19 +8,19 @@ module Anystyle::Parser
|
|
8
8
|
|
9
9
|
%w{ (1992) 1992 2011 1776 }.each do |year|
|
10
10
|
it "returns :year for #{year.inspect}" do
|
11
|
-
f.match(year).
|
11
|
+
expect(f.match(year)).to eq(:year)
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
15
15
|
%w{ (1) (12) (123) }.each do |year|
|
16
16
|
it "returns :year for #{year.inspect}" do
|
17
|
-
f.match(year).
|
17
|
+
expect(f.match(year)).to eq(:numeric)
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
21
|
['pp', 'pp.', '23-4', '6124--19', '48 - 9', '19–27'].each do |page|
|
22
22
|
it "returns :page for #{page.inspect}" do
|
23
|
-
f.match(page).
|
23
|
+
expect(f.match(page)).to eq(:page)
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
@@ -9,52 +9,56 @@ module Anystyle
|
|
9
9
|
describe "#tokenize_names" do
|
10
10
|
|
11
11
|
it "tokenizes 'A B'" do
|
12
|
-
Normalizer.instance.normalize_names('A B').
|
12
|
+
expect(Normalizer.instance.normalize_names('A B')).to eq('B, A.')
|
13
13
|
end
|
14
14
|
|
15
15
|
it "tokenizes 'A, B'" do
|
16
|
-
Normalizer.instance.normalize_names('A, B').
|
16
|
+
expect(Normalizer.instance.normalize_names('A, B')).to eq('A, B.')
|
17
17
|
end
|
18
18
|
|
19
19
|
it "tokenizes 'A, jr., Bbb'" do
|
20
|
-
Normalizer.instance.normalize_names('A, jr., B').
|
20
|
+
expect(Normalizer.instance.normalize_names('A, jr., B')).to eq('A, jr., B.')
|
21
21
|
end
|
22
22
|
|
23
23
|
it "tokenizes 'A, B, jr.'" do
|
24
|
-
Normalizer.instance.normalize_names('A, B, jr.').
|
24
|
+
expect(Normalizer.instance.normalize_names('A, B, jr.')).to eq('A, jr., B.')
|
25
25
|
end
|
26
26
|
|
27
27
|
it "tokenizes 'A, B, C, D'" do
|
28
|
-
Normalizer.instance.normalize_names('A, B, C, D').
|
28
|
+
expect(Normalizer.instance.normalize_names('A, B, C, D')).to eq('A, B. and C, D.')
|
29
29
|
end
|
30
30
|
|
31
31
|
it "tokenizes 'A, B, C'" do
|
32
|
-
Normalizer.instance.normalize_names('A, B, C').
|
32
|
+
expect(Normalizer.instance.normalize_names('A, B, C')).to eq('A, B. and C.')
|
33
33
|
end
|
34
34
|
|
35
35
|
it "tokenizes 'Aa Bb, C.'" do
|
36
|
-
Normalizer.instance.normalize_names('Aa Bb, C.').
|
36
|
+
expect(Normalizer.instance.normalize_names('Aa Bb, C.')).to eq('Aa Bb, C.')
|
37
37
|
end
|
38
38
|
|
39
39
|
it "tokenizes 'Plath, L.C., Asgaard, G., ... Botros, N.'" do
|
40
|
-
Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., ... Botros, N.').
|
41
|
-
Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., … Botros, N.').
|
40
|
+
expect(Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., ... Botros, N.')).to eq('Plath, L.C. and Asgaard, G. and Botros, N.')
|
41
|
+
expect(Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., … Botros, N.')).to eq('Plath, L.C. and Asgaard, G. and Botros, N.')
|
42
42
|
end
|
43
43
|
|
44
44
|
it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
|
45
|
-
Normalizer.instance.normalize_names('Aa Bb, Cc Dd, and E F G').
|
45
|
+
expect(Normalizer.instance.normalize_names('Aa Bb, Cc Dd, and E F G')).to eq('Bb, Aa and Dd, Cc and G, E.F.')
|
46
46
|
end
|
47
47
|
|
48
48
|
[
|
49
49
|
['Poe, Edgar A.', 'Poe, Edgar A.'],
|
50
50
|
['Edgar A. Poe', 'Poe, Edgar A.'],
|
51
|
+
['J Doe', 'Doe, J.'],
|
52
|
+
['Doe, J', 'Doe, J.'],
|
53
|
+
['JE Doe', 'Doe, J.E.'],
|
54
|
+
['Doe, JE', 'Doe, J.E.'],
|
51
55
|
['Edgar A. Poe, Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
|
52
56
|
['Edgar A. Poe; Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
|
53
57
|
['Poe, Edgar A., Melville, Herman', 'Poe, Edgar A. and Melville, Herman'],
|
54
58
|
['Aeschlimann Magnin, E.', 'Aeschlimann Magnin, E.']
|
55
59
|
].each do |name, normalized|
|
56
60
|
it "tokenizes #{name.inspect}" do
|
57
|
-
Normalizer.instance.normalize_names(name).
|
61
|
+
expect(Normalizer.instance.normalize_names(name)).to eq(normalized)
|
58
62
|
end
|
59
63
|
end
|
60
64
|
|
@@ -62,75 +66,81 @@ module Anystyle
|
|
62
66
|
|
63
67
|
describe '#normalize_editor' do
|
64
68
|
it "strips in from beginning" do
|
65
|
-
n.normalize_editor(:editor => 'In D. Knuth (ed.)').
|
66
|
-
n.normalize_editor(:editor => 'In: D. Knuth (ed.)').
|
67
|
-
n.normalize_editor(:editor => 'in: D. Knuth ed.').
|
68
|
-
n.normalize_editor(:editor => 'in D. Knuth (ed)').
|
69
|
+
expect(n.normalize_editor(:editor => 'In D. Knuth (ed.)')).to eq({ :editor => 'Knuth, D.' })
|
70
|
+
expect(n.normalize_editor(:editor => 'In: D. Knuth (ed.)')).to eq({ :editor => 'Knuth, D.' })
|
71
|
+
expect(n.normalize_editor(:editor => 'in: D. Knuth ed.')).to eq({ :editor => 'Knuth, D.' })
|
72
|
+
expect(n.normalize_editor(:editor => 'in D. Knuth (ed)')).to eq({ :editor => 'Knuth, D.' })
|
69
73
|
end
|
70
74
|
|
71
75
|
it "does not strip ed from name" do
|
72
|
-
n.normalize_editor(:editor => 'In Edward Wood').
|
73
|
-
n.normalize_editor(:editor => 'ed by Edward Wood').
|
74
|
-
n.normalize_editor(:editor => 'ed. by Edward Wood').
|
75
|
-
n.normalize_editor(:editor => 'ed by Edward Wood').
|
76
|
+
expect(n.normalize_editor(:editor => 'In Edward Wood')).to eq({ :editor => 'Wood, Edward' })
|
77
|
+
expect(n.normalize_editor(:editor => 'ed by Edward Wood')).to eq({ :editor => 'Wood, Edward' })
|
78
|
+
expect(n.normalize_editor(:editor => 'ed. by Edward Wood')).to eq({ :editor => 'Wood, Edward' })
|
79
|
+
expect(n.normalize_editor(:editor => 'ed by Edward Wood')).to eq({ :editor => 'Wood, Edward' })
|
80
|
+
expect(n.normalize_editor(:editor => 'In Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
|
81
|
+
expect(n.normalize_editor(:editor => 'ed by Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
|
82
|
+
expect(n.normalize_editor(:editor => 'ed. by Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
|
83
|
+
expect(n.normalize_editor(:editor => 'ed by Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
|
76
84
|
end
|
77
85
|
|
78
86
|
it "strips et al" do
|
79
|
-
n.normalize_editor(:editor => 'Edward Wood et al')[:editor].
|
80
|
-
n.normalize_editor(:editor => 'Edward Wood et al.')[:editor].
|
81
|
-
n.normalize_editor(:editor => 'Edward Wood u.a.')[:editor].
|
82
|
-
n.normalize_editor(:editor => 'Edward Wood u. a.')[:editor].
|
83
|
-
n.normalize_editor(:editor => 'Edward Wood and others')[:editor].
|
84
|
-
n.normalize_editor(:editor => 'Edward Wood & others')[:editor].
|
87
|
+
expect(n.normalize_editor(:editor => 'Edward Wood et al')[:editor]).to eq('Wood, Edward')
|
88
|
+
expect(n.normalize_editor(:editor => 'Edward Wood et al.')[:editor]).to eq('Wood, Edward')
|
89
|
+
expect(n.normalize_editor(:editor => 'Edward Wood u.a.')[:editor]).to eq('Wood, Edward')
|
90
|
+
expect(n.normalize_editor(:editor => 'Edward Wood u. a.')[:editor]).to eq('Wood, Edward')
|
91
|
+
expect(n.normalize_editor(:editor => 'Edward Wood and others')[:editor]).to eq('Wood, Edward')
|
92
|
+
expect(n.normalize_editor(:editor => 'Edward Wood & others')[:editor]).to eq('Wood, Edward')
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
describe '#normalize_translator' do
|
97
|
+
it "strips in from beginning" do
|
98
|
+
expect(n.normalize_translator(:translator => 'Translated by J Doe')).to eq({ :translator => 'Doe, J.' })
|
99
|
+
expect(n.normalize_translator(:translator => 'Trans by J Doe')).to eq({ :translator => 'Doe, J.' })
|
100
|
+
expect(n.normalize_translator(:translator => 'Trans. by J Doe')).to eq({ :translator => 'Doe, J.' })
|
101
|
+
expect(n.normalize_translator(:translator => 'Transl. J Doe')).to eq({ :translator => 'Doe, J.' })
|
102
|
+
expect(n.normalize_translator(:translator => 'übersetzt von J Doe')).to eq({ :translator => 'Doe, J.' })
|
103
|
+
expect(n.normalize_translator(:translator => 'übers. v. J Doe')).to eq({ :translator => 'Doe, J.' })
|
104
|
+
expect(n.normalize_translator(:translator => 'Übersetzung v. J Doe')).to eq({ :translator => 'Doe, J.' })
|
105
|
+
expect(n.normalize_translator(:translator => 'In der Übersetzung von J Doe')).to eq({ :translator => 'Doe, J.' })
|
85
106
|
end
|
86
107
|
end
|
87
108
|
|
88
109
|
describe 'editors extraction' do
|
89
110
|
it 'recognizes editors in the author field' do
|
90
|
-
n.normalize_author(:author => 'D. Knuth (ed.)').
|
111
|
+
expect(n.normalize_author(:author => 'D. Knuth (ed.)')).to eq({ :editor => 'Knuth, D.' })
|
91
112
|
end
|
92
113
|
end
|
93
114
|
|
94
115
|
describe 'URL extraction' do
|
95
116
|
it 'recognizes full URLs' do
|
96
|
-
n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf').
|
97
|
-
n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf [Retrieved today]').
|
117
|
+
expect(n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf')).to eq({ :url => 'https://www.example.org/x.pdf' })
|
118
|
+
expect(n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf [Retrieved today]')).to eq({ :url => 'https://www.example.org/x.pdf' })
|
98
119
|
end
|
99
120
|
|
100
121
|
it 'tries to detect URLs without protocol' do
|
101
|
-
n.normalize_url(:url => 'Available at: www.example.org/x.pdf').
|
102
|
-
n.normalize_url(:url => 'Available at: example.org/x.pdf [Retrieved today]').
|
122
|
+
expect(n.normalize_url(:url => 'Available at: www.example.org/x.pdf')).to eq({ :url => 'www.example.org/x.pdf' })
|
123
|
+
expect(n.normalize_url(:url => 'Available at: example.org/x.pdf [Retrieved today]')).to eq({ :url => 'example.org/x.pdf' })
|
103
124
|
end
|
104
125
|
end
|
105
126
|
|
106
127
|
describe 'date extraction' do
|
107
128
|
it 'extracts month and year from a string like "(July 2009)"' do
|
108
|
-
h = Normalizer.instance.normalize_date(:date => '(July 2009)')
|
109
|
-
h[:
|
110
|
-
h[:month].should == 7
|
111
|
-
h.should_not have_key(:date)
|
112
|
-
h.should_not have_key(:day)
|
129
|
+
h = Normalizer.instance.normalize_date(:date => ['(July 2009)'])
|
130
|
+
expect(h[:date]).to eq('2009-07')
|
113
131
|
end
|
114
132
|
|
115
133
|
it 'extracts month and year from a string like "(1997 Sept.)"' do
|
116
134
|
h = Normalizer.instance.normalize_date(:date => '(1997 Sept.)')
|
117
|
-
h[:
|
118
|
-
h[:month].should == 9
|
119
|
-
h.should_not have_key(:date)
|
120
|
-
h.should_not have_key(:day)
|
135
|
+
expect(h[:date]).to eq('1997-09')
|
121
136
|
|
122
|
-
h = Normalizer.instance.normalize_date(:date => '(1997 Okt.)')
|
123
|
-
h[:
|
124
|
-
h[:month].should == 10
|
125
|
-
h.should_not have_key(:day)
|
137
|
+
h = Normalizer.instance.normalize_date(:date => ['(1997 Okt.)'])
|
138
|
+
expect(h[:date]).to eq('1997-10')
|
126
139
|
end
|
127
140
|
|
128
141
|
it 'extracts days if month and year are present' do
|
129
|
-
h = n.normalize_date(:date => '(15 May 1984)')
|
130
|
-
|
131
|
-
h[:year].should == 1984
|
132
|
-
h[:month].should == 5
|
133
|
-
h[:day].should == 15
|
142
|
+
h = n.normalize_date(:date => ['(15 May 1984)'])
|
143
|
+
expect(h[:date]).to eq('1984-05-15')
|
134
144
|
end
|
135
145
|
end
|
136
146
|
|
@@ -7,40 +7,40 @@ module Anystyle::Parser
|
|
7
7
|
|
8
8
|
describe "#tokenize" do
|
9
9
|
it "returns [] when given an empty string" do
|
10
|
-
subject.tokenize('').
|
10
|
+
expect(subject.tokenize('')).to eq([])
|
11
11
|
end
|
12
12
|
|
13
13
|
it "takes a single line and returns an array of token sequences" do
|
14
|
-
subject.tokenize('hello, world!').
|
14
|
+
expect(subject.tokenize('hello, world!')).to eq([%w{ hello, world! }])
|
15
15
|
end
|
16
16
|
|
17
17
|
it "tokenizes volume/page-range exception" do
|
18
|
-
subject.tokenize('hello:world! http://abc.com 3:45 3:1-2 23:1').
|
18
|
+
expect(subject.tokenize('hello:world! http://abc.com 3:45 3:1-2 23:1 45(3):23–7')).to eq([%w{ hello:world! http://abc.com 3: 45 3: 1-2 23: 1 45(3): 23–7}])
|
19
19
|
end
|
20
20
|
|
21
21
|
it "takes two lines and returns an array of token sequences" do
|
22
|
-
subject.tokenize("hello, world!\ngoodbye!").
|
22
|
+
expect(subject.tokenize("hello, world!\ngoodbye!")).to eq([%w{ hello, world! }, %w{ goodbye! }])
|
23
23
|
end
|
24
24
|
|
25
25
|
context "when passing a string marked as tagged" do
|
26
26
|
it "returns [] when given an empty string" do
|
27
|
-
subject.tokenize('', true).
|
27
|
+
expect(subject.tokenize('', true)).to eq([])
|
28
28
|
end
|
29
29
|
|
30
30
|
it "returns an array of :unknown token sequences when given an untagged single line" do
|
31
|
-
subject.tokenize('hello, world!', true).
|
31
|
+
expect(subject.tokenize('hello, world!', true)).to eq([[['hello,', :unknown], ['world!', :unknown]]])
|
32
32
|
end
|
33
33
|
|
34
34
|
it "returns an array of :unknown token sequences when given two untagged lines" do
|
35
|
-
subject.tokenize("hello,\nworld!", true).
|
35
|
+
expect(subject.tokenize("hello,\nworld!", true)).to eq([[['hello,', :unknown]], [['world!', :unknown]]])
|
36
36
|
end
|
37
37
|
|
38
38
|
it "returns an array of token/tag pair for each line when given a single tagged string" do
|
39
|
-
subject.tokenize('<a>hello</a>', true).
|
39
|
+
expect(subject.tokenize('<a>hello</a>', true)).to eq([[['hello', :a]]])
|
40
40
|
end
|
41
41
|
|
42
42
|
it "returns an array of token/tag pair for each line when given a string with multiple tags" do
|
43
|
-
subject.tokenize('<a>hello world</a> <b> !</b>', true).
|
43
|
+
expect(subject.tokenize('<a>hello world</a> <b> !</b>', true)).to eq([[['hello',:a], ['world', :a], ['!', :b]]])
|
44
44
|
end
|
45
45
|
|
46
46
|
it "raises an argument error if the string contains mismatched tags" do
|
@@ -53,22 +53,22 @@ module Anystyle::Parser
|
|
53
53
|
|
54
54
|
describe "#prepare" do
|
55
55
|
it 'returns an array of expanded token sequences' do
|
56
|
-
subject.prepare('hello, world!').
|
56
|
+
expect(subject.prepare('hello, world!')).to eq([['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']])
|
57
57
|
end
|
58
58
|
|
59
59
|
context 'when marking the input as being tagged' do
|
60
60
|
let(:input) { %{<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>} }
|
61
61
|
|
62
62
|
it 'returns an array of expaned and labelled token sequences for a tagged string' do
|
63
|
-
subject.prepare(input, true)[0].map { |t| t[/\S+$/] }.
|
63
|
+
expect(subject.prepare(input, true)[0].map { |t| t[/\S+$/] }).to eq(%w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date })
|
64
64
|
end
|
65
65
|
|
66
66
|
it 'returns an array of expanded and labelled :unknown token sequences for an untagged input' do
|
67
|
-
subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }.
|
67
|
+
expect(subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }).to eq(%w{ unknown unknown })
|
68
68
|
end
|
69
69
|
|
70
70
|
it 'converts xml entitites' do
|
71
|
-
subject.prepare("<note>>> & foo</note>", true)[0].map { |t| t[/\S+/] }.
|
71
|
+
expect(subject.prepare("<note>>> & foo</note>", true)[0].map { |t| t[/\S+/] }).to eq(%w{ >> & foo })
|
72
72
|
end
|
73
73
|
end
|
74
74
|
end
|
@@ -77,31 +77,31 @@ module Anystyle::Parser
|
|
77
77
|
let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
|
78
78
|
|
79
79
|
it 'returns an array of labelled segments' do
|
80
|
-
subject.label(citation)[0].map(&:first).
|
80
|
+
expect(subject.label(citation)[0].map(&:first)).to eq([:author, :title, :location, :publisher, :date, :pages])
|
81
81
|
end
|
82
82
|
|
83
83
|
describe 'when passed more than one line' do
|
84
84
|
it 'returns two arrays' do
|
85
|
-
subject.label("foo\nbar").
|
85
|
+
expect(subject.label("foo\nbar").size).to eq(2)
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
89
|
describe 'when passed invalid input' do
|
90
90
|
it 'returns an empty array for an empty string' do
|
91
|
-
subject.label('').
|
91
|
+
expect(subject.label('')).to eq([])
|
92
92
|
end
|
93
93
|
|
94
94
|
it 'returns an empty array for empty lines' do
|
95
|
-
subject.label("\n").
|
96
|
-
subject.label("\n ").
|
97
|
-
subject.label(" \n ").
|
98
|
-
subject.label(" \n").
|
95
|
+
expect(subject.label("\n")).to eq([])
|
96
|
+
expect(subject.label("\n ")).to eq([])
|
97
|
+
expect(subject.label(" \n ")).to eq([])
|
98
|
+
expect(subject.label(" \n")).to eq([])
|
99
99
|
end
|
100
100
|
|
101
101
|
it 'does not fail for unrecognizable input' do
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
expect { subject.label("@misc{70213094902020,\n") }.not_to raise_error
|
103
|
+
expect { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.not_to raise_error
|
104
|
+
expect { subject.label("\n doi ") }.not_to raise_error
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
@@ -112,28 +112,36 @@ module Anystyle::Parser
|
|
112
112
|
let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
|
113
113
|
|
114
114
|
it 'returns a hash of label/segment pairs by default' do
|
115
|
-
subject.parse(citation)[0].
|
115
|
+
expect(subject.parse(citation)[0]).to eq({
|
116
|
+
:author => 'Perec, Georges',
|
117
|
+
:title => 'A Void',
|
118
|
+
:location => 'London',
|
119
|
+
:publisher => 'The Harvill Press',
|
120
|
+
:date => '1995',
|
121
|
+
:pages => '108',
|
122
|
+
:type => :book
|
123
|
+
})
|
116
124
|
end
|
117
125
|
|
118
126
|
describe 'using output format "tags"' do
|
119
127
|
it 'returns a tagged string' do
|
120
|
-
subject.parse(citation, :tags)[0].
|
128
|
+
expect(subject.parse(citation, :tags)[0]).to eq('<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>')
|
121
129
|
end
|
122
130
|
end
|
123
131
|
|
124
132
|
it 'returns the label/token arrays for format "raw"' do
|
125
|
-
subject.parse(citation, :raw)[0][0].
|
133
|
+
expect(subject.parse(citation, :raw)[0][0]).to eq([:author, 'Perec,'])
|
126
134
|
end
|
127
135
|
|
128
136
|
it 'returns the token in original order for format "raw"' do
|
129
|
-
subject.parse(citation, :raw)[0].map(&:last).join(' ').
|
137
|
+
expect(subject.parse(citation, :raw)[0].map(&:last).join(' ')).to eq(citation)
|
130
138
|
|
131
139
|
difference = 'Derrida, J. (1967). L’écriture et la différence (1 éd.). Paris: Éditions du Seuil.'
|
132
|
-
subject.parse(difference, :raw)[0].map(&:last).join(' ').
|
140
|
+
expect(subject.parse(difference, :raw)[0].map(&:last).join(' ')).to eq(difference)
|
133
141
|
end
|
134
142
|
|
135
143
|
it 'returns xml document for format "raw"' do
|
136
|
-
subject.parse(citation, :xml).
|
144
|
+
expect(subject.parse(citation, :xml)).to eq('<?xml version="1.0" encoding="UTF-8"?><references><reference><author>Perec, Georges.</author><title>A Void.</title><location>London:</location><publisher>The Harvill Press,</publisher><date>1995.</date><pages>p.108.</pages></reference></references>')
|
137
145
|
end
|
138
146
|
end
|
139
147
|
|
@@ -145,14 +153,14 @@ module Anystyle::Parser
|
|
145
153
|
|
146
154
|
it 'recognizes trained references' do
|
147
155
|
subject.learn dps[0]
|
148
|
-
subject.parse(strip_tags(dps[0]), :tags)[0].
|
156
|
+
expect(subject.parse(strip_tags(dps[0]), :tags)[0]).to eq(dps[0])
|
149
157
|
end
|
150
158
|
|
151
159
|
it 'recognizes trained references when learnt in one go' do
|
152
160
|
subject.learn dps
|
153
161
|
|
154
162
|
dps.each do |d|
|
155
|
-
subject.parse(strip_tags(d), :tags)[0].
|
163
|
+
expect(subject.parse(strip_tags(d), :tags)[0]).to eq(d)
|
156
164
|
end
|
157
165
|
end
|
158
166
|
|
@@ -164,7 +172,7 @@ module Anystyle::Parser
|
|
164
172
|
end
|
165
173
|
|
166
174
|
dps.each do |d|
|
167
|
-
subject.parse(strip_tags(d), :tags)[0].
|
175
|
+
expect(subject.parse(strip_tags(d), :tags)[0]).to eq(d)
|
168
176
|
end
|
169
177
|
end
|
170
178
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bibtex-ruby
|
@@ -64,20 +64,14 @@ dependencies:
|
|
64
64
|
requirements:
|
65
65
|
- - ~>
|
66
66
|
- !ruby/object:Gem::Version
|
67
|
-
version: '0.
|
68
|
-
- - '>='
|
69
|
-
- !ruby/object:Gem::Version
|
70
|
-
version: 0.8.7
|
67
|
+
version: '0.9'
|
71
68
|
type: :runtime
|
72
69
|
prerelease: false
|
73
70
|
version_requirements: !ruby/object:Gem::Requirement
|
74
71
|
requirements:
|
75
72
|
- - ~>
|
76
73
|
- !ruby/object:Gem::Version
|
77
|
-
version: '0.
|
78
|
-
- - '>='
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: 0.8.7
|
74
|
+
version: '0.9'
|
81
75
|
description: A sophisticated parser for academic reference lists and bibliographies
|
82
76
|
based on machine learning algorithms using conditional random fields.
|
83
77
|
email:
|