anystyle-parser 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +28 -12
- data/HISTORY.md +6 -0
- data/LICENSE +2 -2
- data/README.md +11 -11
- data/Rakefile +14 -3
- data/anystyle-parser.gemspec +13 -8
- data/features/support/env.rb +18 -0
- data/lib/anystyle/parser/dictionary.rb +35 -37
- data/lib/anystyle/parser/errors.rb +18 -18
- data/lib/anystyle/parser/parser.rb +254 -244
- data/lib/anystyle/parser/utility.rb +18 -18
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/parser_spec.rb +119 -115
- data/spec/spec_helper.rb +9 -2
- metadata +26 -43
- data/.autotest +0 -0
- data/.gitignore +0 -5
- data/.rspec +0 -3
@@ -1,19 +1,19 @@
|
|
1
1
|
module Anystyle
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
2
|
+
|
3
|
+
def self.parse(*arguments)
|
4
|
+
Parser::Parser.instance.parse(*arguments)
|
5
|
+
end
|
6
|
+
|
7
|
+
def self.parser
|
8
|
+
Parser::Parser.instance
|
9
|
+
end
|
10
|
+
|
11
|
+
module Parser
|
12
|
+
|
13
|
+
def self.instance
|
14
|
+
Parser.instance
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
@@ -1,120 +1,124 @@
|
|
1
1
|
module Anystyle::Parser
|
2
2
|
describe Parser do
|
3
|
-
|
3
|
+
|
4
4
|
it { should_not be nil }
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
5
|
+
|
6
|
+
describe "#tokenize" do
|
7
|
+
it "returns [] when given an empty string" do
|
8
|
+
subject.tokenize('').should == []
|
9
|
+
end
|
10
|
+
|
11
|
+
it "takes a single line and returns an array of token sequences" do
|
12
|
+
subject.tokenize('hello, world!').should == [%w{ hello, world! }]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "takes two lines and returns an array of token sequences" do
|
16
|
+
subject.tokenize("hello, world!\ngoodbye!").should == [%w{ hello, world! }, %w{ goodbye! }]
|
17
|
+
end
|
18
|
+
|
19
|
+
context "when passing a string marked as tagged" do
|
20
|
+
it "returns [] when given an empty string" do
|
21
|
+
subject.tokenize('', true).should == []
|
22
|
+
end
|
23
|
+
|
24
|
+
it "returns an array of :unknown token sequences when given an untagged single line" do
|
25
|
+
subject.tokenize('hello, world!', true).should == [[['hello,', :unknown], ['world!', :unknown]]]
|
26
|
+
end
|
27
|
+
|
28
|
+
it "returns an array of :unknown token sequences when given two untagged lines" do
|
29
|
+
subject.tokenize("hello,\nworld!", true).should == [[['hello,', :unknown]], [['world!', :unknown]]]
|
30
|
+
end
|
31
|
+
|
32
|
+
it "returns an array of token/tag pair for each line when given a single tagged string" do
|
33
|
+
subject.tokenize('<a>hello</a>', true).should == [[['hello', :a]]]
|
34
|
+
end
|
35
|
+
|
36
|
+
it "returns an array of token/tag pair for each line when given a string with multiple tags" do
|
37
|
+
subject.tokenize('<a>hello world</a> <b> !</b>', true).should == [[['hello',:a], ['world', :a], ['!', :b]]]
|
38
|
+
end
|
39
|
+
|
40
|
+
it "raises an argument error if the string contains mismatched tags" do
|
41
|
+
expect { subject.tokenize('<a> hello </b>', true) }.to raise_error(ArgumentError)
|
42
|
+
expect { subject.tokenize('<a> hello <b> world </a>', true) }.to raise_error(ArgumentError)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
describe "#prepare" do
|
49
|
+
it 'returns an array of expanded token sequences' do
|
50
|
+
subject.prepare('hello, world!').should == [['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']]
|
51
|
+
end
|
52
|
+
|
53
|
+
context 'when marking the input as being tagged' do
|
54
|
+
let(:input) { %{<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>} }
|
55
|
+
|
56
|
+
it 'returns an array of expaned and labelled token sequences for a tagged string' do
|
57
|
+
subject.prepare(input, true)[0].map { |t| t[/\S+$/] }.should == %w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date }
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'returns an array of expanded and labelled :unknown token sequences for an untagged input' do
|
61
|
+
subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }.should == %w{ unknown unknown }
|
62
|
+
end
|
63
|
+
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
describe "#label" do
|
68
|
+
let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
|
69
|
+
|
70
|
+
it 'returns an array of labelled segments' do
|
71
|
+
subject.label(citation)[0].map(&:first).should == [:author, :title, :location, :publisher, :date, :pages]
|
72
|
+
end
|
73
|
+
|
74
|
+
describe 'when passed more than one line' do
|
75
|
+
it 'returns two arrays' do
|
76
|
+
subject.label("foo\nbar").should have(2).elements
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
describe 'when passed invalid input' do
|
81
|
+
it 'returns an empty array for an empty string' do
|
82
|
+
subject.label('').should == []
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'returns an empty array for an empty line' do
|
86
|
+
subject.label("\n").should == []
|
87
|
+
subject.label("\n ").should == [[],[]]
|
88
|
+
subject.label(" \n ").should == [[],[]]
|
89
|
+
subject.label(" \n").should == [[]]
|
90
|
+
end
|
91
|
+
|
92
|
+
it 'does not fail for unrecognizable input' do
|
93
|
+
lambda { subject.label("@misc{70213094902020,\n") }.should_not raise_error
|
94
|
+
lambda { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.should_not raise_error
|
95
|
+
|
96
|
+
pending
|
97
|
+
lambda { subject.label("\n doi ") }.should_not raise_error
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
|
102
|
+
end
|
103
|
+
|
104
|
+
describe "#parse" do
|
105
|
+
let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
|
106
|
+
|
107
|
+
it 'returns a hash of label/segment pairs by default' do
|
108
|
+
subject.parse(citation)[0].should == { :author => 'Perec, Georges', :title => 'A Void', :location => 'London', :publisher => 'The Harvill Press', :year => 1995, :pages => '108', :type => :book }
|
109
|
+
end
|
110
|
+
|
111
|
+
describe 'using output format "tags"' do
|
112
|
+
it 'returns a tagged string' do
|
113
|
+
subject.parse(citation, :tags)[0].should == '<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>'
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
it 'returns the label/token arrays for format "raw"' do
|
118
|
+
subject.parse(citation, :raw)[0][0].should == [:author, 'Perec,']
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
|
119
123
|
end
|
120
124
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,17 +1,24 @@
|
|
1
1
|
begin
|
2
2
|
require 'simplecov'
|
3
|
+
require 'coveralls' if ENV['CI']
|
3
4
|
rescue LoadError
|
4
5
|
# ignore
|
5
6
|
end
|
6
7
|
|
7
8
|
begin
|
8
|
-
|
9
|
+
case
|
10
|
+
when defined?(RUBY_ENGINE) && RUBY_ENGINE == 'rbx'
|
11
|
+
require 'rubinius/debugger'
|
12
|
+
else
|
13
|
+
require 'debugger'
|
14
|
+
end
|
9
15
|
rescue LoadError
|
10
16
|
# ignore
|
11
17
|
end
|
12
18
|
|
13
19
|
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
20
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
21
|
+
|
15
22
|
require 'rspec'
|
16
23
|
require 'anystyle/parser'
|
17
24
|
|
@@ -20,5 +27,5 @@ require 'anystyle/parser'
|
|
20
27
|
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
21
28
|
|
22
29
|
RSpec.configure do |config|
|
23
|
-
|
30
|
+
|
24
31
|
end
|
metadata
CHANGED
@@ -1,66 +1,59 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anystyle-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.3.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Sylvester Keil
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-02-07 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: bibtex-ruby
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- - ~>
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: '
|
19
|
+
version: '3.0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- - ~>
|
24
|
+
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: '
|
26
|
+
version: '3.0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: wapiti
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- - ~>
|
31
|
+
- - "~>"
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0.0'
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- - ~>
|
38
|
+
- - "~>"
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0.0'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: namae
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- - ~>
|
45
|
+
- - "~>"
|
52
46
|
- !ruby/object:Gem::Version
|
53
|
-
version: 0.
|
47
|
+
version: '0.8'
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- - ~>
|
52
|
+
- - "~>"
|
60
53
|
- !ruby/object:Gem::Version
|
61
|
-
version: 0.
|
62
|
-
description: A sophisticated parser for academic
|
63
|
-
algorithms using conditional random fields.
|
54
|
+
version: '0.8'
|
55
|
+
description: A sophisticated parser for academic reference lists and bibliographies
|
56
|
+
based on machine learning algorithms using conditional random fields.
|
64
57
|
email:
|
65
58
|
- http://sylvester.keil.or.at
|
66
59
|
executables: []
|
@@ -69,9 +62,6 @@ extra_rdoc_files:
|
|
69
62
|
- README.md
|
70
63
|
- LICENSE
|
71
64
|
files:
|
72
|
-
- .autotest
|
73
|
-
- .gitignore
|
74
|
-
- .rspec
|
75
65
|
- Gemfile
|
76
66
|
- HISTORY.md
|
77
67
|
- LICENSE
|
@@ -102,40 +92,33 @@ files:
|
|
102
92
|
homepage: http://github.com/inukshuk/anystyle-parser
|
103
93
|
licenses:
|
104
94
|
- FreeBSD
|
95
|
+
metadata: {}
|
105
96
|
post_install_message:
|
106
97
|
rdoc_options:
|
107
|
-
- --line-numbers
|
108
|
-
- --inline-source
|
109
|
-
- --title
|
110
|
-
-
|
111
|
-
- --main
|
98
|
+
- "--line-numbers"
|
99
|
+
- "--inline-source"
|
100
|
+
- "--title"
|
101
|
+
- "\"Anystyle Parser\""
|
102
|
+
- "--main"
|
112
103
|
- README.md
|
113
104
|
require_paths:
|
114
105
|
- lib
|
115
106
|
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
-
none: false
|
117
107
|
requirements:
|
118
|
-
- -
|
108
|
+
- - ">="
|
119
109
|
- !ruby/object:Gem::Version
|
120
|
-
version:
|
121
|
-
segments:
|
122
|
-
- 0
|
123
|
-
hash: -2142174744936810203
|
110
|
+
version: 1.9.3
|
124
111
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
|
-
none: false
|
126
112
|
requirements:
|
127
|
-
- -
|
113
|
+
- - ">="
|
128
114
|
- !ruby/object:Gem::Version
|
129
115
|
version: '0'
|
130
|
-
segments:
|
131
|
-
- 0
|
132
|
-
hash: -2142174744936810203
|
133
116
|
requirements: []
|
134
117
|
rubyforge_project:
|
135
|
-
rubygems_version:
|
118
|
+
rubygems_version: 2.2.1
|
136
119
|
signing_key:
|
137
|
-
specification_version:
|
138
|
-
summary:
|
120
|
+
specification_version: 4
|
121
|
+
summary: Smart and fast academic bibliography parser.
|
139
122
|
test_files:
|
140
123
|
- features/step_definitions/parser_steps.rb
|
141
124
|
- features/support/env.rb
|