anystyle-parser 0.6.9 → 0.6.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b75781230fa07b3d6d751a7222913923193f173f
4
- data.tar.gz: 762ecbac1a452a771c8b1eb6670cbce7a9a86ed6
3
+ metadata.gz: 978c32592fd4e4edade7f5648a1b3095d652a68b
4
+ data.tar.gz: e310a276de3bb0e4c94ca4df72ed8dda378a9fd6
5
5
  SHA512:
6
- metadata.gz: d9b61a7966fd8cf7024180ecfe19c055b0d2bdeb1dde5f2a5a727d8e7c18ff65e11b607bdf4b164d6b6e8bb0eb651ea5459e3a3e96835c3fe32337e07e0eebd6
7
- data.tar.gz: b1b0ef651aba9e2aa2ace965c8cff1fa90ed9168682fe5078ad94a784051d3b3594a575a1adaf3065094193646a842cac3203736d5a9321c77299cc6dc0df806
6
+ metadata.gz: 8ef818cc5486835a51de69f4cc30d46975e162c0b0b8a7cd2d08ab3e94a3956e7ffecaa1f6e4755a29b186d3f9df56b89bd6e554e8defc64f8da7e2a1fc50c8f
7
+ data.tar.gz: ae731f1ace948eafa16fe20bf921b48debe52f4656368f4da3ae16b2b8a6612bd2d7b09b728209558b9a27fd2f734e65752012fd714d6521983d98ff0ba96245
data/Gemfile CHANGED
@@ -4,7 +4,7 @@ gemspec
4
4
  group :development, :test do
5
5
  gem 'rake'
6
6
  gem 'cucumber'
7
- gem 'rspec'
7
+ gem 'rspec', '~>3.0'
8
8
  gem 'simplecov', '~>0.8', :require => false
9
9
  gem 'rubinius-coverage', :platform => :rbx
10
10
  gem 'coveralls', :require => false
data/README.md CHANGED
@@ -17,7 +17,9 @@ CSL/CiteProc JSON format).
17
17
  Web Application and Web Service
18
18
  -------------------------------
19
19
  Anystyle-Parser is available as a web application and a web service at
20
- [http://anystyle.io](http://anystyle.io).
20
+ [http://anystyle.io](http://anystyle.io). For example Ruby code using
21
+ the anystyle.io API, see this [prototype](https://gist.github.com/inukshuk/f1d47aeab1f778bca8ce)
22
+ for a style predictor.
21
23
 
22
24
  Installation
23
25
  ------------
@@ -20,7 +20,7 @@ Gem::Specification.new do |s|
20
20
  s.add_runtime_dependency('bibtex-ruby', '~>3.0')
21
21
  s.add_runtime_dependency('builder', '>=3.0', '<4.0')
22
22
  s.add_runtime_dependency('wapiti', '~>0.1')
23
- s.add_runtime_dependency('namae', '~>0.8', '>=0.8.7')
23
+ s.add_runtime_dependency('namae', '~>0.9')
24
24
 
25
25
  s.files = `git ls-files`.split("\n").reject { |path|
26
26
  path.start_with?('.')
@@ -58,6 +58,16 @@ module Anystyle
58
58
  hash
59
59
  end
60
60
 
61
+ def normalize_accessed(hash)
62
+ token, *dangling = hash[:accessed]
63
+ unmatched(:accessed, hash, dangling) unless dangling.empty?
64
+
65
+ token.gsub!(/(accessed|retrieved):?\s*/i, '')
66
+
67
+ hash[:accessed] = token
68
+ hash
69
+ end
70
+
61
71
  def normalize_key(hash)
62
72
  token, *dangling = hash[:key]
63
73
  unmatched(:key, hash, dangling) unless dangling.empty?
@@ -114,8 +124,8 @@ module Anystyle
114
124
 
115
125
  editors.gsub!(/^\W+|\W+$/, '')
116
126
  editors.gsub!(/^in:?\s+/i, '')
117
- editors.gsub!(/[^[:alpha:]]*[Ee]d(s|itors?|ited)?\b[^[:alpha:]]*/, '')
118
- editors.gsub!(/[^[:alpha:]]*([Hh]rsg|Herausgeber)\b[^[:alpha:]]*/, '')
127
+ editors.gsub!(/\W*\b[Ee]d(s|itors?|ited)?\b\W*/, '')
128
+ editors.gsub!(/\W*\b([Hh]rsg|gg?|Herausgeber)\b\W*/, '')
119
129
  editors.gsub!(/\b[Hh]erausgegeben von\b/, '')
120
130
  editors.gsub!(/\bby\b/i, '')
121
131
 
@@ -134,8 +144,9 @@ module Anystyle
134
144
  def normalize_translator(hash)
135
145
  translators = hash[:translator]
136
146
 
147
+ translators.gsub!(/\b([Ii]n (d|ein)er )?[Üü]ber(s\.|setzt|setzung|tragen|tragung) v(\.|on\b)/, '')
137
148
  translators.gsub!(/^\W+|\W+$/, '')
138
- translators.gsub!(/[^[:alpha:]]*trans(lated)?\b[^[:alpha:]]*/i, '')
149
+ translators.gsub!(/[^[:alpha:]]*\btrans(l(ated)?)?\b[^[:alpha:]]*/i, '')
139
150
  translators.gsub!(/\bby\b/i, '')
140
151
 
141
152
  hash[:translator] = normalize_names(translators)
@@ -170,10 +181,7 @@ module Anystyle
170
181
  names.gsub!(/;|:/, ',')
171
182
 
172
183
  Namae.parse!(names).map { |name|
173
- unless name.given.nil? || name.family.nil?
174
- name.given.gsub!(/\b([[:upper:]])(\s|$)/, '\1.\2')
175
- end
176
-
184
+ name.normalize_initials
177
185
  name.sort_order
178
186
 
179
187
  }.join(' and ')
@@ -270,21 +278,20 @@ module Anystyle
270
278
  end
271
279
 
272
280
  def normalize_date(hash)
273
- date, *dangling = hash[:date]
274
- unmatched(:date, hash, dangling) unless dangling.empty?
281
+ date = Array(hash[:date]).join(' ')
275
282
 
276
283
  unless (month = MONTH[date]).nil?
277
- hash[:month] = month
284
+ month = '%02d' % month
278
285
  end
279
286
 
280
287
  if date =~ /(\d{4})/
281
- hash[:year] = $1.to_i
288
+ year = $1
282
289
 
283
- if hash.key?(:month) && date =~ /\b(\d{1,2})\b/
284
- hash[:day] = $1.to_i
290
+ if month && date =~ /\b(\d{1,2})\b/
291
+ day = '%02d' % $1.to_i
285
292
  end
286
293
 
287
- hash.delete(:date)
294
+ hash[:date] = [year, month, day].compact.join('-')
288
295
  end
289
296
 
290
297
  hash
@@ -3,17 +3,17 @@ module Anystyle
3
3
 
4
4
  class Parser
5
5
 
6
- @formats = [:bibtex, :hash, :citeproc, :xml, :tags, :raw].freeze
6
+ @formats = [:bibtex, :hash, :normalized, :citeproc, :xml, :tags, :raw].freeze
7
7
 
8
8
  @defaults = {
9
9
  :model => File.expand_path('../support/anystyle.mod', __FILE__),
10
10
  :pattern => File.expand_path('../support/anystyle.pat', __FILE__),
11
11
  :compact => true,
12
12
  :threads => 4,
13
- :separator => /\s+|\b(\d+:)/,
13
+ :separator => /\s+|\b(\d\S*:)/,
14
14
  :tagged_separator => /\s+|(<\/?[^>]+>)/,
15
15
  :strip => /[^[:alnum:]]/,
16
- :format => :hash,
16
+ :format => :normalized,
17
17
  :xml_entities => Hash[*%w{ &amp; & &lt; < &gt; > &apos; ' &quot; " }],
18
18
  :training_data => File.expand_path('../../../../resources/train.txt', __FILE__)
19
19
  }.freeze
@@ -258,8 +258,17 @@ module Anystyle
258
258
 
259
259
  def format_bibtex(labels)
260
260
  b = BibTeX::Bibliography.new
261
- format_hash(labels).each do |hash|
261
+ format_normalized(labels).each do |hash|
262
262
  hash[:address] = hash.delete :location if hash.key?(:location)
263
+ hash[:urldate] = hash.delete :accessed if hash.key?(:accessed)
264
+
265
+ if hash.key?(:authority)
266
+ if [:techreport,:thesis].include?(hash[:type])
267
+ hash[:institution] = hash.delete :authority
268
+ else
269
+ hash[:organization] = hash.delete :authority
270
+ end
271
+ end
263
272
 
264
273
  b << BibTeX::Entry.new(hash)
265
274
  end
@@ -276,7 +285,7 @@ module Anystyle
276
285
 
277
286
  def format_hash(labels)
278
287
  labels.map do |line|
279
- hash = line.inject({}) do |h, (label, token)|
288
+ line.inject({}) do |h, (label, token)|
280
289
  if h.has_key?(label)
281
290
  h[label] = [h[label]].flatten << token
282
291
  else
@@ -284,10 +293,13 @@ module Anystyle
284
293
  end
285
294
  h
286
295
  end
287
- normalize hash
288
296
  end
289
297
  end
290
298
 
299
+ def format_normalized(labels)
300
+ format_hash(labels).map { |h| normalize h }
301
+ end
302
+
291
303
  def format_citeproc(labels)
292
304
  format_bibtex(labels).to_citeproc
293
305
  end
@@ -1,5 +1,5 @@
1
1
  module Anystyle
2
2
  module Parser
3
- VERSION = '0.6.9'.freeze
3
+ VERSION = '0.6.10'.freeze
4
4
  end
5
5
  end
@@ -7,16 +7,16 @@ module Anystyle
7
7
 
8
8
  let(:dict) { Dictionary.instance }
9
9
 
10
- it { Dictionary.should_not respond_to(:new) }
11
- it { dict.should_not be nil }
10
+ it { expect(Dictionary).not_to respond_to(:new) }
11
+ it { expect(dict).not_to be nil }
12
12
 
13
13
  describe '.modes' do
14
14
  it 'returns an array' do
15
- Dictionary.modes.should be_a(Array)
15
+ expect(Dictionary.modes).to be_a(Array)
16
16
  end
17
17
 
18
18
  it 'contains at least :hash' do
19
- Dictionary.modes.should include(:hash)
19
+ expect(Dictionary.modes).to include(:hash)
20
20
  end
21
21
  end
22
22
 
@@ -24,12 +24,12 @@ module Anystyle
24
24
 
25
25
  %w{ philippines italy }.each do |place|
26
26
  it "#{place.inspect} should be a place name" do
27
- dict[place].should == Dictionary.code[:place]
27
+ expect(dict[place]).to eq(Dictionary.code[:place])
28
28
  end
29
29
  end
30
30
 
31
31
  it "accepts unicode strins like 'çela' (surname)" do
32
- (dict['çela'] & Dictionary.code[:surname]).should > 0
32
+ expect(dict['çela'] & Dictionary.code[:surname]).to be > 0
33
33
  end
34
34
 
35
35
  end
@@ -8,19 +8,19 @@ module Anystyle::Parser
8
8
 
9
9
  %w{ (1992) 1992 2011 1776 }.each do |year|
10
10
  it "returns :year for #{year.inspect}" do
11
- f.match(year).should == :year
11
+ expect(f.match(year)).to eq(:year)
12
12
  end
13
13
  end
14
14
 
15
15
  %w{ (1) (12) (123) }.each do |year|
16
16
  it "returns :year for #{year.inspect}" do
17
- f.match(year).should == :numeric
17
+ expect(f.match(year)).to eq(:numeric)
18
18
  end
19
19
  end
20
20
 
21
21
  ['pp', 'pp.', '23-4', '6124--19', '48 - 9', '19–27'].each do |page|
22
22
  it "returns :page for #{page.inspect}" do
23
- f.match(page).should == :page
23
+ expect(f.match(page)).to eq(:page)
24
24
  end
25
25
  end
26
26
 
@@ -9,52 +9,56 @@ module Anystyle
9
9
  describe "#tokenize_names" do
10
10
 
11
11
  it "tokenizes 'A B'" do
12
- Normalizer.instance.normalize_names('A B').should == 'B, A.'
12
+ expect(Normalizer.instance.normalize_names('A B')).to eq('B, A.')
13
13
  end
14
14
 
15
15
  it "tokenizes 'A, B'" do
16
- Normalizer.instance.normalize_names('A, B').should == 'A, B.'
16
+ expect(Normalizer.instance.normalize_names('A, B')).to eq('A, B.')
17
17
  end
18
18
 
19
19
  it "tokenizes 'A, jr., Bbb'" do
20
- Normalizer.instance.normalize_names('A, jr., B').should == 'A, jr., B.'
20
+ expect(Normalizer.instance.normalize_names('A, jr., B')).to eq('A, jr., B.')
21
21
  end
22
22
 
23
23
  it "tokenizes 'A, B, jr.'" do
24
- Normalizer.instance.normalize_names('A, B, jr.').should == 'A, jr., B.'
24
+ expect(Normalizer.instance.normalize_names('A, B, jr.')).to eq('A, jr., B.')
25
25
  end
26
26
 
27
27
  it "tokenizes 'A, B, C, D'" do
28
- Normalizer.instance.normalize_names('A, B, C, D').should == 'A, B. and C, D.'
28
+ expect(Normalizer.instance.normalize_names('A, B, C, D')).to eq('A, B. and C, D.')
29
29
  end
30
30
 
31
31
  it "tokenizes 'A, B, C'" do
32
- Normalizer.instance.normalize_names('A, B, C').should == 'A, B. and C'
32
+ expect(Normalizer.instance.normalize_names('A, B, C')).to eq('A, B. and C.')
33
33
  end
34
34
 
35
35
  it "tokenizes 'Aa Bb, C.'" do
36
- Normalizer.instance.normalize_names('Aa Bb, C.').should == 'Aa Bb, C.'
36
+ expect(Normalizer.instance.normalize_names('Aa Bb, C.')).to eq('Aa Bb, C.')
37
37
  end
38
38
 
39
39
  it "tokenizes 'Plath, L.C., Asgaard, G., ... Botros, N.'" do
40
- Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., ... Botros, N.').should == 'Plath, L.C. and Asgaard, G. and Botros, N.'
41
- Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., … Botros, N.').should == 'Plath, L.C. and Asgaard, G. and Botros, N.'
40
+ expect(Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., ... Botros, N.')).to eq('Plath, L.C. and Asgaard, G. and Botros, N.')
41
+ expect(Normalizer.instance.normalize_names('Plath, L.C., Asgaard, G., … Botros, N.')).to eq('Plath, L.C. and Asgaard, G. and Botros, N.')
42
42
  end
43
43
 
44
44
  it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
45
- Normalizer.instance.normalize_names('Aa Bb, Cc Dd, and E F G').should == 'Bb, Aa and Dd, Cc and G, E. F.'
45
+ expect(Normalizer.instance.normalize_names('Aa Bb, Cc Dd, and E F G')).to eq('Bb, Aa and Dd, Cc and G, E.F.')
46
46
  end
47
47
 
48
48
  [
49
49
  ['Poe, Edgar A.', 'Poe, Edgar A.'],
50
50
  ['Edgar A. Poe', 'Poe, Edgar A.'],
51
+ ['J Doe', 'Doe, J.'],
52
+ ['Doe, J', 'Doe, J.'],
53
+ ['JE Doe', 'Doe, J.E.'],
54
+ ['Doe, JE', 'Doe, J.E.'],
51
55
  ['Edgar A. Poe, Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
52
56
  ['Edgar A. Poe; Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
53
57
  ['Poe, Edgar A., Melville, Herman', 'Poe, Edgar A. and Melville, Herman'],
54
58
  ['Aeschlimann Magnin, E.', 'Aeschlimann Magnin, E.']
55
59
  ].each do |name, normalized|
56
60
  it "tokenizes #{name.inspect}" do
57
- Normalizer.instance.normalize_names(name).should == normalized
61
+ expect(Normalizer.instance.normalize_names(name)).to eq(normalized)
58
62
  end
59
63
  end
60
64
 
@@ -62,75 +66,81 @@ module Anystyle
62
66
 
63
67
  describe '#normalize_editor' do
64
68
  it "strips in from beginning" do
65
- n.normalize_editor(:editor => 'In D. Knuth (ed.)').should == { :editor => 'Knuth, D.' }
66
- n.normalize_editor(:editor => 'In: D. Knuth (ed.)').should == { :editor => 'Knuth, D.' }
67
- n.normalize_editor(:editor => 'in: D. Knuth ed.').should == { :editor => 'Knuth, D.' }
68
- n.normalize_editor(:editor => 'in D. Knuth (ed)').should == { :editor => 'Knuth, D.' }
69
+ expect(n.normalize_editor(:editor => 'In D. Knuth (ed.)')).to eq({ :editor => 'Knuth, D.' })
70
+ expect(n.normalize_editor(:editor => 'In: D. Knuth (ed.)')).to eq({ :editor => 'Knuth, D.' })
71
+ expect(n.normalize_editor(:editor => 'in: D. Knuth ed.')).to eq({ :editor => 'Knuth, D.' })
72
+ expect(n.normalize_editor(:editor => 'in D. Knuth (ed)')).to eq({ :editor => 'Knuth, D.' })
69
73
  end
70
74
 
71
75
  it "does not strip ed from name" do
72
- n.normalize_editor(:editor => 'In Edward Wood').should == { :editor => 'Wood, Edward' }
73
- n.normalize_editor(:editor => 'ed by Edward Wood').should == { :editor => 'Wood, Edward' }
74
- n.normalize_editor(:editor => 'ed. by Edward Wood').should == { :editor => 'Wood, Edward' }
75
- n.normalize_editor(:editor => 'ed by Edward Wood').should == { :editor => 'Wood, Edward' }
76
+ expect(n.normalize_editor(:editor => 'In Edward Wood')).to eq({ :editor => 'Wood, Edward' })
77
+ expect(n.normalize_editor(:editor => 'ed by Edward Wood')).to eq({ :editor => 'Wood, Edward' })
78
+ expect(n.normalize_editor(:editor => 'ed. by Edward Wood')).to eq({ :editor => 'Wood, Edward' })
79
+ expect(n.normalize_editor(:editor => 'ed by Edward Wood')).to eq({ :editor => 'Wood, Edward' })
80
+ expect(n.normalize_editor(:editor => 'In Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
81
+ expect(n.normalize_editor(:editor => 'ed by Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
82
+ expect(n.normalize_editor(:editor => 'ed. by Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
83
+ expect(n.normalize_editor(:editor => 'ed by Alfred Wood')).to eq({ :editor => 'Wood, Alfred' })
76
84
  end
77
85
 
78
86
  it "strips et al" do
79
- n.normalize_editor(:editor => 'Edward Wood et al')[:editor].should == 'Wood, Edward'
80
- n.normalize_editor(:editor => 'Edward Wood et al.')[:editor].should == 'Wood, Edward'
81
- n.normalize_editor(:editor => 'Edward Wood u.a.')[:editor].should == 'Wood, Edward'
82
- n.normalize_editor(:editor => 'Edward Wood u. a.')[:editor].should == 'Wood, Edward'
83
- n.normalize_editor(:editor => 'Edward Wood and others')[:editor].should == 'Wood, Edward'
84
- n.normalize_editor(:editor => 'Edward Wood & others')[:editor].should == 'Wood, Edward'
87
+ expect(n.normalize_editor(:editor => 'Edward Wood et al')[:editor]).to eq('Wood, Edward')
88
+ expect(n.normalize_editor(:editor => 'Edward Wood et al.')[:editor]).to eq('Wood, Edward')
89
+ expect(n.normalize_editor(:editor => 'Edward Wood u.a.')[:editor]).to eq('Wood, Edward')
90
+ expect(n.normalize_editor(:editor => 'Edward Wood u. a.')[:editor]).to eq('Wood, Edward')
91
+ expect(n.normalize_editor(:editor => 'Edward Wood and others')[:editor]).to eq('Wood, Edward')
92
+ expect(n.normalize_editor(:editor => 'Edward Wood & others')[:editor]).to eq('Wood, Edward')
93
+ end
94
+ end
95
+
96
+ describe '#normalize_translator' do
97
+ it "strips in from beginning" do
98
+ expect(n.normalize_translator(:translator => 'Translated by J Doe')).to eq({ :translator => 'Doe, J.' })
99
+ expect(n.normalize_translator(:translator => 'Trans by J Doe')).to eq({ :translator => 'Doe, J.' })
100
+ expect(n.normalize_translator(:translator => 'Trans. by J Doe')).to eq({ :translator => 'Doe, J.' })
101
+ expect(n.normalize_translator(:translator => 'Transl. J Doe')).to eq({ :translator => 'Doe, J.' })
102
+ expect(n.normalize_translator(:translator => 'übersetzt von J Doe')).to eq({ :translator => 'Doe, J.' })
103
+ expect(n.normalize_translator(:translator => 'übers. v. J Doe')).to eq({ :translator => 'Doe, J.' })
104
+ expect(n.normalize_translator(:translator => 'Übersetzung v. J Doe')).to eq({ :translator => 'Doe, J.' })
105
+ expect(n.normalize_translator(:translator => 'In der Übersetzung von J Doe')).to eq({ :translator => 'Doe, J.' })
85
106
  end
86
107
  end
87
108
 
88
109
  describe 'editors extraction' do
89
110
  it 'recognizes editors in the author field' do
90
- n.normalize_author(:author => 'D. Knuth (ed.)').should == { :editor => 'Knuth, D.' }
111
+ expect(n.normalize_author(:author => 'D. Knuth (ed.)')).to eq({ :editor => 'Knuth, D.' })
91
112
  end
92
113
  end
93
114
 
94
115
  describe 'URL extraction' do
95
116
  it 'recognizes full URLs' do
96
- n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf').should == { :url => 'https://www.example.org/x.pdf' }
97
- n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf [Retrieved today]').should == { :url => 'https://www.example.org/x.pdf' }
117
+ expect(n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf')).to eq({ :url => 'https://www.example.org/x.pdf' })
118
+ expect(n.normalize_url(:url => 'Available at: https://www.example.org/x.pdf [Retrieved today]')).to eq({ :url => 'https://www.example.org/x.pdf' })
98
119
  end
99
120
 
100
121
  it 'tries to detect URLs without protocol' do
101
- n.normalize_url(:url => 'Available at: www.example.org/x.pdf').should == { :url => 'www.example.org/x.pdf' }
102
- n.normalize_url(:url => 'Available at: example.org/x.pdf [Retrieved today]').should == { :url => 'example.org/x.pdf' }
122
+ expect(n.normalize_url(:url => 'Available at: www.example.org/x.pdf')).to eq({ :url => 'www.example.org/x.pdf' })
123
+ expect(n.normalize_url(:url => 'Available at: example.org/x.pdf [Retrieved today]')).to eq({ :url => 'example.org/x.pdf' })
103
124
  end
104
125
  end
105
126
 
106
127
  describe 'date extraction' do
107
128
  it 'extracts month and year from a string like "(July 2009)"' do
108
- h = Normalizer.instance.normalize_date(:date => '(July 2009)')
109
- h[:year].should == 2009
110
- h[:month].should == 7
111
- h.should_not have_key(:date)
112
- h.should_not have_key(:day)
129
+ h = Normalizer.instance.normalize_date(:date => ['(July 2009)'])
130
+ expect(h[:date]).to eq('2009-07')
113
131
  end
114
132
 
115
133
  it 'extracts month and year from a string like "(1997 Sept.)"' do
116
134
  h = Normalizer.instance.normalize_date(:date => '(1997 Sept.)')
117
- h[:year].should == 1997
118
- h[:month].should == 9
119
- h.should_not have_key(:date)
120
- h.should_not have_key(:day)
135
+ expect(h[:date]).to eq('1997-09')
121
136
 
122
- h = Normalizer.instance.normalize_date(:date => '(1997 Okt.)')
123
- h[:year].should == 1997
124
- h[:month].should == 10
125
- h.should_not have_key(:day)
137
+ h = Normalizer.instance.normalize_date(:date => ['(1997 Okt.)'])
138
+ expect(h[:date]).to eq('1997-10')
126
139
  end
127
140
 
128
141
  it 'extracts days if month and year are present' do
129
- h = n.normalize_date(:date => '(15 May 1984)')
130
-
131
- h[:year].should == 1984
132
- h[:month].should == 5
133
- h[:day].should == 15
142
+ h = n.normalize_date(:date => ['(15 May 1984)'])
143
+ expect(h[:date]).to eq('1984-05-15')
134
144
  end
135
145
  end
136
146
 
@@ -7,40 +7,40 @@ module Anystyle::Parser
7
7
 
8
8
  describe "#tokenize" do
9
9
  it "returns [] when given an empty string" do
10
- subject.tokenize('').should == []
10
+ expect(subject.tokenize('')).to eq([])
11
11
  end
12
12
 
13
13
  it "takes a single line and returns an array of token sequences" do
14
- subject.tokenize('hello, world!').should == [%w{ hello, world! }]
14
+ expect(subject.tokenize('hello, world!')).to eq([%w{ hello, world! }])
15
15
  end
16
16
 
17
17
  it "tokenizes volume/page-range exception" do
18
- subject.tokenize('hello:world! http://abc.com 3:45 3:1-2 23:1').should == [%w{ hello:world! http://abc.com 3: 45 3: 1-2 23: 1 }]
18
+ expect(subject.tokenize('hello:world! http://abc.com 3:45 3:1-2 23:1 45(3):23–7')).to eq([%w{ hello:world! http://abc.com 3: 45 3: 1-2 23: 1 45(3): 23–7}])
19
19
  end
20
20
 
21
21
  it "takes two lines and returns an array of token sequences" do
22
- subject.tokenize("hello, world!\ngoodbye!").should == [%w{ hello, world! }, %w{ goodbye! }]
22
+ expect(subject.tokenize("hello, world!\ngoodbye!")).to eq([%w{ hello, world! }, %w{ goodbye! }])
23
23
  end
24
24
 
25
25
  context "when passing a string marked as tagged" do
26
26
  it "returns [] when given an empty string" do
27
- subject.tokenize('', true).should == []
27
+ expect(subject.tokenize('', true)).to eq([])
28
28
  end
29
29
 
30
30
  it "returns an array of :unknown token sequences when given an untagged single line" do
31
- subject.tokenize('hello, world!', true).should == [[['hello,', :unknown], ['world!', :unknown]]]
31
+ expect(subject.tokenize('hello, world!', true)).to eq([[['hello,', :unknown], ['world!', :unknown]]])
32
32
  end
33
33
 
34
34
  it "returns an array of :unknown token sequences when given two untagged lines" do
35
- subject.tokenize("hello,\nworld!", true).should == [[['hello,', :unknown]], [['world!', :unknown]]]
35
+ expect(subject.tokenize("hello,\nworld!", true)).to eq([[['hello,', :unknown]], [['world!', :unknown]]])
36
36
  end
37
37
 
38
38
  it "returns an array of token/tag pair for each line when given a single tagged string" do
39
- subject.tokenize('<a>hello</a>', true).should == [[['hello', :a]]]
39
+ expect(subject.tokenize('<a>hello</a>', true)).to eq([[['hello', :a]]])
40
40
  end
41
41
 
42
42
  it "returns an array of token/tag pair for each line when given a string with multiple tags" do
43
- subject.tokenize('<a>hello world</a> <b> !</b>', true).should == [[['hello',:a], ['world', :a], ['!', :b]]]
43
+ expect(subject.tokenize('<a>hello world</a> <b> !</b>', true)).to eq([[['hello',:a], ['world', :a], ['!', :b]]])
44
44
  end
45
45
 
46
46
  it "raises an argument error if the string contains mismatched tags" do
@@ -53,22 +53,22 @@ module Anystyle::Parser
53
53
 
54
54
  describe "#prepare" do
55
55
  it 'returns an array of expanded token sequences' do
56
- subject.prepare('hello, world!').should == [['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']]
56
+ expect(subject.prepare('hello, world!')).to eq([['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']])
57
57
  end
58
58
 
59
59
  context 'when marking the input as being tagged' do
60
60
  let(:input) { %{<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>} }
61
61
 
62
62
  it 'returns an array of expaned and labelled token sequences for a tagged string' do
63
- subject.prepare(input, true)[0].map { |t| t[/\S+$/] }.should == %w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date }
63
+ expect(subject.prepare(input, true)[0].map { |t| t[/\S+$/] }).to eq(%w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date })
64
64
  end
65
65
 
66
66
  it 'returns an array of expanded and labelled :unknown token sequences for an untagged input' do
67
- subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }.should == %w{ unknown unknown }
67
+ expect(subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }).to eq(%w{ unknown unknown })
68
68
  end
69
69
 
70
70
  it 'converts xml entitites' do
71
- subject.prepare("<note>&gt;&gt; &amp; foo</note>", true)[0].map { |t| t[/\S+/] }.should == %w{ >> & foo }
71
+ expect(subject.prepare("<note>&gt;&gt; &amp; foo</note>", true)[0].map { |t| t[/\S+/] }).to eq(%w{ >> & foo })
72
72
  end
73
73
  end
74
74
  end
@@ -77,31 +77,31 @@ module Anystyle::Parser
77
77
  let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
78
78
 
79
79
  it 'returns an array of labelled segments' do
80
- subject.label(citation)[0].map(&:first).should == [:author, :title, :location, :publisher, :date, :pages]
80
+ expect(subject.label(citation)[0].map(&:first)).to eq([:author, :title, :location, :publisher, :date, :pages])
81
81
  end
82
82
 
83
83
  describe 'when passed more than one line' do
84
84
  it 'returns two arrays' do
85
- subject.label("foo\nbar").should have(2).elements
85
+ expect(subject.label("foo\nbar").size).to eq(2)
86
86
  end
87
87
  end
88
88
 
89
89
  describe 'when passed invalid input' do
90
90
  it 'returns an empty array for an empty string' do
91
- subject.label('').should == []
91
+ expect(subject.label('')).to eq([])
92
92
  end
93
93
 
94
94
  it 'returns an empty array for empty lines' do
95
- subject.label("\n").should == []
96
- subject.label("\n ").should == []
97
- subject.label(" \n ").should == []
98
- subject.label(" \n").should == []
95
+ expect(subject.label("\n")).to eq([])
96
+ expect(subject.label("\n ")).to eq([])
97
+ expect(subject.label(" \n ")).to eq([])
98
+ expect(subject.label(" \n")).to eq([])
99
99
  end
100
100
 
101
101
  it 'does not fail for unrecognizable input' do
102
- lambda { subject.label("@misc{70213094902020,\n") }.should_not raise_error
103
- lambda { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.should_not raise_error
104
- lambda { subject.label("\n doi ") }.should_not raise_error
102
+ expect { subject.label("@misc{70213094902020,\n") }.not_to raise_error
103
+ expect { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.not_to raise_error
104
+ expect { subject.label("\n doi ") }.not_to raise_error
105
105
  end
106
106
  end
107
107
 
@@ -112,28 +112,36 @@ module Anystyle::Parser
112
112
  let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
113
113
 
114
114
  it 'returns a hash of label/segment pairs by default' do
115
- subject.parse(citation)[0].should == { :author => 'Perec, Georges', :title => 'A Void', :location => 'London', :publisher => 'The Harvill Press', :year => 1995, :pages => '108', :type => :book }
115
+ expect(subject.parse(citation)[0]).to eq({
116
+ :author => 'Perec, Georges',
117
+ :title => 'A Void',
118
+ :location => 'London',
119
+ :publisher => 'The Harvill Press',
120
+ :date => '1995',
121
+ :pages => '108',
122
+ :type => :book
123
+ })
116
124
  end
117
125
 
118
126
  describe 'using output format "tags"' do
119
127
  it 'returns a tagged string' do
120
- subject.parse(citation, :tags)[0].should == '<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>'
128
+ expect(subject.parse(citation, :tags)[0]).to eq('<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>')
121
129
  end
122
130
  end
123
131
 
124
132
  it 'returns the label/token arrays for format "raw"' do
125
- subject.parse(citation, :raw)[0][0].should == [:author, 'Perec,']
133
+ expect(subject.parse(citation, :raw)[0][0]).to eq([:author, 'Perec,'])
126
134
  end
127
135
 
128
136
  it 'returns the token in original order for format "raw"' do
129
- subject.parse(citation, :raw)[0].map(&:last).join(' ').should == citation
137
+ expect(subject.parse(citation, :raw)[0].map(&:last).join(' ')).to eq(citation)
130
138
 
131
139
  difference = 'Derrida, J. (1967). L’écriture et la différence (1 éd.). Paris: Éditions du Seuil.'
132
- subject.parse(difference, :raw)[0].map(&:last).join(' ').should == difference
140
+ expect(subject.parse(difference, :raw)[0].map(&:last).join(' ')).to eq(difference)
133
141
  end
134
142
 
135
143
  it 'returns xml document for format "raw"' do
136
- subject.parse(citation, :xml).should == '<?xml version="1.0" encoding="UTF-8"?><references><reference><author>Perec, Georges.</author><title>A Void.</title><location>London:</location><publisher>The Harvill Press,</publisher><date>1995.</date><pages>p.108.</pages></reference></references>'
144
+ expect(subject.parse(citation, :xml)).to eq('<?xml version="1.0" encoding="UTF-8"?><references><reference><author>Perec, Georges.</author><title>A Void.</title><location>London:</location><publisher>The Harvill Press,</publisher><date>1995.</date><pages>p.108.</pages></reference></references>')
137
145
  end
138
146
  end
139
147
 
@@ -145,14 +153,14 @@ module Anystyle::Parser
145
153
 
146
154
  it 'recognizes trained references' do
147
155
  subject.learn dps[0]
148
- subject.parse(strip_tags(dps[0]), :tags)[0].should == dps[0]
156
+ expect(subject.parse(strip_tags(dps[0]), :tags)[0]).to eq(dps[0])
149
157
  end
150
158
 
151
159
  it 'recognizes trained references when learnt in one go' do
152
160
  subject.learn dps
153
161
 
154
162
  dps.each do |d|
155
- subject.parse(strip_tags(d), :tags)[0].should == d
163
+ expect(subject.parse(strip_tags(d), :tags)[0]).to eq(d)
156
164
  end
157
165
  end
158
166
 
@@ -164,7 +172,7 @@ module Anystyle::Parser
164
172
  end
165
173
 
166
174
  dps.each do |d|
167
- subject.parse(strip_tags(d), :tags)[0].should == d
175
+ expect(subject.parse(strip_tags(d), :tags)[0]).to eq(d)
168
176
  end
169
177
  end
170
178
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.9
4
+ version: 0.6.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sylvester Keil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-16 00:00:00.000000000 Z
11
+ date: 2014-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bibtex-ruby
@@ -64,20 +64,14 @@ dependencies:
64
64
  requirements:
65
65
  - - ~>
66
66
  - !ruby/object:Gem::Version
67
- version: '0.8'
68
- - - '>='
69
- - !ruby/object:Gem::Version
70
- version: 0.8.7
67
+ version: '0.9'
71
68
  type: :runtime
72
69
  prerelease: false
73
70
  version_requirements: !ruby/object:Gem::Requirement
74
71
  requirements:
75
72
  - - ~>
76
73
  - !ruby/object:Gem::Version
77
- version: '0.8'
78
- - - '>='
79
- - !ruby/object:Gem::Version
80
- version: 0.8.7
74
+ version: '0.9'
81
75
  description: A sophisticated parser for academic reference lists and bibliographies
82
76
  based on machine learning algorithms using conditional random fields.
83
77
  email: