anystyle-parser 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -2
- data/HISTORY.md +6 -0
- data/anystyle-parser.gemspec +1 -1
- data/lib/anystyle/parser/features.rb +208 -208
- data/lib/anystyle/parser/normalizer.rb +359 -359
- data/lib/anystyle/parser/parser.rb +28 -10
- data/lib/anystyle/parser/support/anystyle.mod +32347 -5039
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/features_spec.rb +27 -21
- data/spec/anystyle/parser/normalizer_spec.rb +83 -62
- data/spec/anystyle/parser/parser_spec.rb +49 -6
- data/spec/fixtures/train_dps.txt +12 -0
- data/spec/spec_helper.rb +15 -3
- metadata +7 -5
@@ -1,24 +1,30 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
module Anystyle::Parser
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
4
|
+
describe "Features" do
|
5
|
+
|
6
|
+
describe "numbers" do
|
7
|
+
let(:f) { Parser.feature[:numbers] }
|
8
|
+
|
9
|
+
%w{ (1992) 1992 2011 1776 }.each do |year|
|
10
|
+
it "returns :year for #{year.inspect}" do
|
11
|
+
f.match(year).should == :year
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
%w{ (1) (12) (123) }.each do |year|
|
16
|
+
it "returns :year for #{year.inspect}" do
|
17
|
+
f.match(year).should == :numeric
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
['pp', 'pp.', '23-4', '6124--19', '48 - 9', '19–27'].each do |page|
|
22
|
+
it "returns :page for #{page.inspect}" do
|
23
|
+
f.match(page).should == :page
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
30
|
+
end
|
@@ -1,63 +1,84 @@
|
|
1
1
|
module Anystyle
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
2
|
+
module Parser
|
3
|
+
|
4
|
+
describe "Normalizer" do
|
5
|
+
let(:n) { Normalizer.instance }
|
6
|
+
|
7
|
+
describe "#tokenize_names" do
|
8
|
+
|
9
|
+
it "tokenizes 'A B'" do
|
10
|
+
Normalizer.instance.normalize_names('A B').should == 'B, A'
|
11
|
+
end
|
12
|
+
|
13
|
+
it "tokenizes 'A, B'" do
|
14
|
+
Normalizer.instance.normalize_names('A, B').should == 'A, B'
|
15
|
+
end
|
16
|
+
|
17
|
+
# it "tokenizes 'A, jr., B'" do
|
18
|
+
# Normalizer.instance.normalize_names('A, jr., B').should == 'A, jr., B'
|
19
|
+
# end
|
20
|
+
#
|
21
|
+
# it "tokenizes 'A, B, jr.'" do
|
22
|
+
# Normalizer.instance.normalize_names('A, B, jr.').should == 'A, B, jr.'
|
23
|
+
# end
|
24
|
+
|
25
|
+
it "tokenizes 'A, B, C, D'" do
|
26
|
+
Normalizer.instance.normalize_names('A, B, C, D').should == 'A, B and C, D'
|
27
|
+
end
|
28
|
+
|
29
|
+
it "tokenizes 'A, B, C'" do
|
30
|
+
Normalizer.instance.normalize_names('A, B, C').should == 'A, B and C'
|
31
|
+
end
|
32
|
+
|
33
|
+
it "tokenizes 'Aa Bb, C.'" do
|
34
|
+
Normalizer.instance.normalize_names('Aa Bb, C.').should == 'Aa Bb, C.'
|
35
|
+
end
|
36
|
+
|
37
|
+
it "tokenizes 'Aa Bb, Cc Dd, and E F G'" do
|
38
|
+
Normalizer.instance.normalize_names('Aa Bb, C D, and E F G').should == 'Bb, Aa and D, C and G, E F'
|
39
|
+
end
|
40
|
+
|
41
|
+
[
|
42
|
+
['Poe, Edgar A.', 'Poe, Edgar A.'],
|
43
|
+
['Edgar A. Poe', 'Poe, Edgar A.'],
|
44
|
+
['Edgar A. Poe, Herman Melville', 'Poe, Edgar A. and Melville, Herman'],
|
45
|
+
['Poe, Edgar A., Melville, Herman', 'Poe, Edgar A. and Melville, Herman'],
|
46
|
+
['Aeschlimann Magnin, E.', 'Aeschlimann Magnin, E.']
|
47
|
+
].each do |name, normalized|
|
48
|
+
it "tokenizes #{name.inspect}" do
|
49
|
+
Normalizer.instance.normalize_names(name).should == normalized
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
describe 'editors extraction' do
|
56
|
+
it 'recognizes editors in the author field' do
|
57
|
+
n.normalize_author(:author => 'D. Knuth (ed.)').should == { :editor => 'Knuth, D.' }
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
describe 'date extraction' do
|
62
|
+
it 'extracts month and year from a string like "(July 2009)"' do
|
63
|
+
h = Normalizer.instance.normalize_date(:date => '(July 2009)')
|
64
|
+
h[:year].should == 2009
|
65
|
+
h[:month].should == 7
|
66
|
+
h.should_not have_key(:date)
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'extracts month and year from a string like "(1997 Sept.)"' do
|
70
|
+
h = Normalizer.instance.normalize_date(:date => '(1997 Sept.)')
|
71
|
+
h[:year].should == 1997
|
72
|
+
h[:month].should == 9
|
73
|
+
h.should_not have_key(:date)
|
74
|
+
|
75
|
+
h = Normalizer.instance.normalize_date(:date => '(1997 Okt.)')
|
76
|
+
h[:year].should == 1997
|
77
|
+
h[:month].should == 10
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
82
|
+
|
83
|
+
end
|
84
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
1
3
|
module Anystyle::Parser
|
2
4
|
describe Parser do
|
3
5
|
|
@@ -82,18 +84,16 @@ module Anystyle::Parser
|
|
82
84
|
subject.label('').should == []
|
83
85
|
end
|
84
86
|
|
85
|
-
it 'returns an empty array for
|
87
|
+
it 'returns an empty array for empty lines' do
|
86
88
|
subject.label("\n").should == []
|
87
|
-
subject.label("\n ").should == [
|
88
|
-
subject.label(" \n ").should == [
|
89
|
-
subject.label(" \n").should == [
|
89
|
+
subject.label("\n ").should == []
|
90
|
+
subject.label(" \n ").should == []
|
91
|
+
subject.label(" \n").should == []
|
90
92
|
end
|
91
93
|
|
92
94
|
it 'does not fail for unrecognizable input' do
|
93
95
|
lambda { subject.label("@misc{70213094902020,\n") }.should_not raise_error
|
94
96
|
lambda { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.should_not raise_error
|
95
|
-
|
96
|
-
pending
|
97
97
|
lambda { subject.label("\n doi ") }.should_not raise_error
|
98
98
|
end
|
99
99
|
end
|
@@ -117,8 +117,51 @@ module Anystyle::Parser
|
|
117
117
|
it 'returns the label/token arrays for format "raw"' do
|
118
118
|
subject.parse(citation, :raw)[0][0].should == [:author, 'Perec,']
|
119
119
|
end
|
120
|
+
|
121
|
+
it 'returns the token in original order for format "raw"' do
|
122
|
+
subject.parse(citation, :raw)[0].map(&:last).join(' ').should == citation
|
123
|
+
|
124
|
+
difference = 'Derrida, J. (1967). L’écriture et la différence (1 éd.). Paris: Éditions du Seuil.'
|
125
|
+
subject.parse(difference, :raw)[0].map(&:last).join(' ').should == difference
|
126
|
+
end
|
120
127
|
end
|
121
128
|
|
129
|
+
describe "#train" do
|
130
|
+
let(:dps) { File.open(fixture_path('train_dps.txt'), 'r:UTF-8').read.split(/\n/) }
|
131
|
+
|
132
|
+
describe "a pristine model" do
|
133
|
+
before(:each) { subject.train '', true }
|
134
|
+
|
135
|
+
it 'recognizes trained references' do
|
136
|
+
subject.learn dps[0]
|
137
|
+
subject.parse(strip_tags(dps[0]), :tags)[0].should == dps[0]
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'recognizes trained references when learnt in one go' do
|
141
|
+
subject.learn dps
|
142
|
+
|
143
|
+
dps.each do |d|
|
144
|
+
subject.parse(strip_tags(d), :tags)[0].should == d
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'recognizes trained references when learnt separately' do
|
149
|
+
pending
|
150
|
+
|
151
|
+
dps.each do |d|
|
152
|
+
subject.learn d
|
153
|
+
end
|
154
|
+
|
155
|
+
dps.each do |d|
|
156
|
+
subject.parse(strip_tags(d), :tags)[0].should == d
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
describe "the default model" do
|
162
|
+
|
163
|
+
end
|
164
|
+
end
|
122
165
|
|
123
166
|
end
|
124
167
|
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<author>Williams, J.B., J.D. Shorthouse and R.E. Lee, Jr.</author> <date>2002.</date> <title>Extreme resistance to desiccation and microclimate-related differences in cold-hardiness of gall wasps (Hymenoptera; Cynipidae) overwintering on roses in southern Canada.</title> <journal>The Journal of Experimental Biology</journal> <volume>205:</volume> <pages>2115–2124.</pages>
|
2
|
+
<author>MACDONALD, S., & FENNIAK T.</author> <date>(2007).</date> <title>Understory plant communities of boreal mixedwood forests in western Canada: Natural patterns and response to variable-retention harvesting.</title> <journal>Forest Ecology and Management.</journal> <volume>242(1):</volume> <pages>34–48.</pages>
|
3
|
+
<author>Harris, P. and J.D. Shorthouse.</author> <date>1996.</date> <title>Effectiveness of gall inducers in weed biological control.</title> <journal>The Canadian Entomologist</journal> <volume>128:</volume> <pages>1021–1055.</pages>
|
4
|
+
<author>Williams, J.B., J.D. Shorthouse and R.E. Lee, Jr.</author> <date>2003.</date> <title>Deleterious effects of mild simulated overwintering temperatures on survival and potential fecundity of rose-galling Diplolepis wasps (Hymenoptera: Cynipidae).</title> <journal>Journal of Experimental Zoology</journal> <volume>298A:</volume> <pages>23–31.</pages>
|
5
|
+
<author>Shorthouse, J.D., H. Goulet and D.P. Shorthouse.</author> <date>2003.</date> <title>Notes on cynipid galls, ground beetles and ground-dwelling spiders collected at Fort Severn, Ontario.</title> <journal>Arctic</journal> <volume>56:</volume> <pages>159–167.</pages>
|
6
|
+
<author>Epling, C., Lewis H., & Ball F. M.</author> <date>(1960).</date> <title>The Breeding Group and Seed Storage: A Study in Population Dynamics.</title> <journal>Evolution.</journal> <volume>14,</volume> <pages>238-255.</pages>
|
7
|
+
<author>Bagatto, Giuseppe, Louise C. Paquette, and Joseph D. Shorthouse.</author> <date>1995.</date> <title>Influence of galls of Phanacis taraxaci on carbon partitioning within common dandelion, Taraxacum officinale.</title> <journal>Entomologia Experimentalis et Applicata</journal> <volume>79:</volume> <pages>111–117.</pages>
|
8
|
+
<author>Shorthouse, J.D.</author> <date>1993.</date> <title>Adaptations of gall wasps of the genus Diplolepis (Hymenoptera: Cynipidae) and the role of gall anatomy in cynipid systematics.</title> <journal>Memoirs of the Entomological Society of Canada</journal> <volume>165:</volume> <pages>139–163.</pages>
|
9
|
+
<author>Bagatto, G., and J.D. Shorthouse.</author> <date>2000.</date> <title>Evaluation of municipal solid waste (MSW) compost as a soil amendment for acidic, metalliferous mine tailings.</title> <journal>International Journal of Surface Mining, Reclamation and Environment</journal> <volume>14:</volume> <pages>205–214.</pages>
|
10
|
+
<author>Shorthouse, J.D., D. Wool, and A. Raman</author> <date>2005.</date> <title>Gall-inducing insects - nature's most sophisticated herbivores.</title> <journal>Basic and Applied Ecology</journal> <volume>6:</volume> <pages>407–411.</pages>
|
11
|
+
<author>Sopow, S.L., J.D. Shorthouse, W. Strong, and D.T. Quiring.</author> <date>2003.</date> <title>Evidence for long-distance, chemical gall induction by an insect.</title> <journal>Ecology Letters</journal> <volume>6:</volume> <pages>102–105.</pages>
|
12
|
+
<author>Shorthouse, J.D. and J.J. Leggo.</author> <date>2002.</date> <title>Immature stages of the galler Diplolepis triforma (Hymenoptera: Cynipidae) with comments on the role of its prepupa.</title> <journal>The Canadian Entomologist</journal> <volume>134</volume> <pages>433–446.</pages>
|
data/spec/spec_helper.rb
CHANGED
@@ -22,10 +22,22 @@ $LOAD_PATH.unshift(File.dirname(__FILE__))
|
|
22
22
|
require 'rspec'
|
23
23
|
require 'anystyle/parser'
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
module Fixtures
|
26
|
+
PATH = File.expand_path('../fixtures', __FILE__)
|
27
|
+
|
28
|
+
Dir[File.join(PATH, '*.rb')].each do |fixture|
|
29
|
+
require fixture
|
30
|
+
end
|
31
|
+
|
32
|
+
def fixture_path(path)
|
33
|
+
File.join(PATH, path)
|
34
|
+
end
|
35
|
+
end
|
28
36
|
|
29
37
|
RSpec.configure do |config|
|
38
|
+
config.include Fixtures
|
30
39
|
|
40
|
+
def strip_tags(string)
|
41
|
+
string.gsub(/<[^>]+>/, '')
|
42
|
+
end
|
31
43
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bibtex-ruby
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '0.
|
33
|
+
version: '0.1'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '0.
|
40
|
+
version: '0.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: namae
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -87,6 +87,7 @@ files:
|
|
87
87
|
- spec/anystyle/parser/normalizer_spec.rb
|
88
88
|
- spec/anystyle/parser/parser_spec.rb
|
89
89
|
- spec/benchmark.rb
|
90
|
+
- spec/fixtures/train_dps.txt
|
90
91
|
- spec/profile.rb
|
91
92
|
- spec/spec_helper.rb
|
92
93
|
homepage: http://github.com/inukshuk/anystyle-parser
|
@@ -115,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
115
116
|
version: '0'
|
116
117
|
requirements: []
|
117
118
|
rubyforge_project:
|
118
|
-
rubygems_version: 2.2.
|
119
|
+
rubygems_version: 2.2.2
|
119
120
|
signing_key:
|
120
121
|
specification_version: 4
|
121
122
|
summary: Smart and fast academic bibliography parser.
|
@@ -127,6 +128,7 @@ test_files:
|
|
127
128
|
- spec/anystyle/parser/normalizer_spec.rb
|
128
129
|
- spec/anystyle/parser/parser_spec.rb
|
129
130
|
- spec/benchmark.rb
|
131
|
+
- spec/fixtures/train_dps.txt
|
130
132
|
- spec/profile.rb
|
131
133
|
- spec/spec_helper.rb
|
132
134
|
has_rdoc:
|