anystyle-parser 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,19 +1,19 @@
1
1
  module Anystyle
2
-
3
- def self.parse(*arguments)
4
- Parser::Parser.instance.parse(*arguments)
5
- end
6
-
7
- def self.parser
8
- Parser::Parser.instance
9
- end
10
-
11
- module Parser
12
-
13
- def self.instance
14
- Parser.instance
15
- end
16
-
17
- end
18
-
19
- end
2
+
3
+ def self.parse(*arguments)
4
+ Parser::Parser.instance.parse(*arguments)
5
+ end
6
+
7
+ def self.parser
8
+ Parser::Parser.instance
9
+ end
10
+
11
+ module Parser
12
+
13
+ def self.instance
14
+ Parser.instance
15
+ end
16
+
17
+ end
18
+
19
+ end
@@ -1,5 +1,5 @@
1
1
  module Anystyle
2
2
  module Parser
3
- VERSION = '0.2.1'.freeze
3
+ VERSION = '0.3.0'.freeze
4
4
  end
5
5
  end
@@ -1,120 +1,124 @@
1
1
  module Anystyle::Parser
2
2
  describe Parser do
3
-
3
+
4
4
  it { should_not be nil }
5
-
6
- describe "#tokenize" do
7
- it "returns [] when given an empty string" do
8
- subject.tokenize('').should == []
9
- end
10
-
11
- it "takes a single line and returns an array of token sequences" do
12
- subject.tokenize('hello, world!').should == [%w{ hello, world! }]
13
- end
14
-
15
- it "takes two lines and returns an array of token sequences" do
16
- subject.tokenize("hello, world!\ngoodbye!").should == [%w{ hello, world! }, %w{ goodbye! }]
17
- end
18
-
19
- context "when passing a string marked as tagged" do
20
- it "returns [] when given an empty string" do
21
- subject.tokenize('', true).should == []
22
- end
23
-
24
- it "returns an array of :unknown token sequences when given an untagged single line" do
25
- subject.tokenize('hello, world!', true).should == [[['hello,', :unknown], ['world!', :unknown]]]
26
- end
27
-
28
- it "returns an array of :unknown token sequences when given two untagged lines" do
29
- subject.tokenize("hello,\nworld!", true).should == [[['hello,', :unknown]], [['world!', :unknown]]]
30
- end
31
-
32
- it "returns an array of token/tag pair for each line when given a single tagged string" do
33
- subject.tokenize('<a>hello</a>', true).should == [[['hello', :a]]]
34
- end
35
-
36
- it "returns an array of token/tag pair for each line when given a string with multiple tags" do
37
- subject.tokenize('<a>hello world</a> <b> !</b>', true).should == [[['hello',:a], ['world', :a], ['!', :b]]]
38
- end
39
-
40
- it "raises an argument error if the string contains mismatched tags" do
41
- expect { subject.tokenize('<a> hello </b>', true) }.to raise_error(ArgumentError)
42
- expect { subject.tokenize('<a> hello <b> world </a>', true) }.to raise_error(ArgumentError)
43
- end
44
- end
45
-
46
- end
47
-
48
- describe "#prepare" do
49
- it 'returns an array of expanded token sequences' do
50
- subject.prepare('hello, world!').should == [['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']]
51
- end
52
-
53
- context 'when marking the input as being tagged' do
54
- let(:input) { %{<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>} }
55
-
56
- it 'returns an array of expaned and labelled token sequences for a tagged string' do
57
- subject.prepare(input, true)[0].map { |t| t[/\S+$/] }.should == %w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date }
58
- end
59
-
60
- it 'returns an array of expanded and labelled :unknown token sequences for an untagged input' do
61
- subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }.should == %w{ unknown unknown }
62
- end
63
-
64
- end
65
- end
66
-
67
- describe "#label" do
68
- let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
69
-
70
- it 'returns an array of labelled segments' do
71
- subject.label(citation)[0].map(&:first).should == [:author, :title, :location, :publisher, :date, :pages]
72
- end
73
-
74
- describe 'when passed more than one line' do
75
- it 'returns two arrays' do
76
- subject.label("foo\nbar").should have(2).elements
77
- end
78
- end
79
-
80
- describe 'when passed invalid input' do
81
- it 'returns an empty array for an empty string' do
82
- subject.label('').should == []
83
- end
84
-
85
- it 'returns an empty array for an empty line' do
86
- subject.label("\n").should == []
87
- subject.label("\n ").should == [[],[]]
88
- subject.label(" \n ").should == [[],[]]
89
- subject.label(" \n").should == [[]]
90
- end
91
-
92
- it 'does not fail for unrecognizable input' do
93
- lambda { subject.label("@misc{70213094902020,\n") }.should_not raise_error
94
- lambda { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.should_not raise_error
95
-
96
- pending
97
- lambda { subject.label("\n doi ") }.should_not raise_error
98
- end
99
- end
100
-
101
-
102
- end
103
-
104
- describe "#parse" do
105
- let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
106
-
107
- it 'returns a hash of label/segment pairs by default' do
108
- subject.parse(citation)[0].should == { :author => 'Perec, Georges', :title => 'A Void', :location => 'London', :publisher => 'The Harvill Press', :year => 1995, :pages => '108', :type => :book }
109
- end
110
-
111
- describe 'using output format "tags"' do
112
- it 'returns a tagged string' do
113
- subject.parse(citation, :tags)[0].should == '<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>'
114
- end
115
- end
116
- end
117
-
118
-
5
+
6
+ describe "#tokenize" do
7
+ it "returns [] when given an empty string" do
8
+ subject.tokenize('').should == []
9
+ end
10
+
11
+ it "takes a single line and returns an array of token sequences" do
12
+ subject.tokenize('hello, world!').should == [%w{ hello, world! }]
13
+ end
14
+
15
+ it "takes two lines and returns an array of token sequences" do
16
+ subject.tokenize("hello, world!\ngoodbye!").should == [%w{ hello, world! }, %w{ goodbye! }]
17
+ end
18
+
19
+ context "when passing a string marked as tagged" do
20
+ it "returns [] when given an empty string" do
21
+ subject.tokenize('', true).should == []
22
+ end
23
+
24
+ it "returns an array of :unknown token sequences when given an untagged single line" do
25
+ subject.tokenize('hello, world!', true).should == [[['hello,', :unknown], ['world!', :unknown]]]
26
+ end
27
+
28
+ it "returns an array of :unknown token sequences when given two untagged lines" do
29
+ subject.tokenize("hello,\nworld!", true).should == [[['hello,', :unknown]], [['world!', :unknown]]]
30
+ end
31
+
32
+ it "returns an array of token/tag pair for each line when given a single tagged string" do
33
+ subject.tokenize('<a>hello</a>', true).should == [[['hello', :a]]]
34
+ end
35
+
36
+ it "returns an array of token/tag pair for each line when given a string with multiple tags" do
37
+ subject.tokenize('<a>hello world</a> <b> !</b>', true).should == [[['hello',:a], ['world', :a], ['!', :b]]]
38
+ end
39
+
40
+ it "raises an argument error if the string contains mismatched tags" do
41
+ expect { subject.tokenize('<a> hello </b>', true) }.to raise_error(ArgumentError)
42
+ expect { subject.tokenize('<a> hello <b> world </a>', true) }.to raise_error(ArgumentError)
43
+ end
44
+ end
45
+
46
+ end
47
+
48
+ describe "#prepare" do
49
+ it 'returns an array of expanded token sequences' do
50
+ subject.prepare('hello, world!').should == [['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']]
51
+ end
52
+
53
+ context 'when marking the input as being tagged' do
54
+ let(:input) { %{<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>} }
55
+
56
+ it 'returns an array of expaned and labelled token sequences for a tagged string' do
57
+ subject.prepare(input, true)[0].map { |t| t[/\S+$/] }.should == %w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date }
58
+ end
59
+
60
+ it 'returns an array of expanded and labelled :unknown token sequences for an untagged input' do
61
+ subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }.should == %w{ unknown unknown }
62
+ end
63
+
64
+ end
65
+ end
66
+
67
+ describe "#label" do
68
+ let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
69
+
70
+ it 'returns an array of labelled segments' do
71
+ subject.label(citation)[0].map(&:first).should == [:author, :title, :location, :publisher, :date, :pages]
72
+ end
73
+
74
+ describe 'when passed more than one line' do
75
+ it 'returns two arrays' do
76
+ subject.label("foo\nbar").should have(2).elements
77
+ end
78
+ end
79
+
80
+ describe 'when passed invalid input' do
81
+ it 'returns an empty array for an empty string' do
82
+ subject.label('').should == []
83
+ end
84
+
85
+ it 'returns an empty array for an empty line' do
86
+ subject.label("\n").should == []
87
+ subject.label("\n ").should == [[],[]]
88
+ subject.label(" \n ").should == [[],[]]
89
+ subject.label(" \n").should == [[]]
90
+ end
91
+
92
+ it 'does not fail for unrecognizable input' do
93
+ lambda { subject.label("@misc{70213094902020,\n") }.should_not raise_error
94
+ lambda { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.should_not raise_error
95
+
96
+ pending
97
+ lambda { subject.label("\n doi ") }.should_not raise_error
98
+ end
99
+ end
100
+
101
+
102
+ end
103
+
104
+ describe "#parse" do
105
+ let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
106
+
107
+ it 'returns a hash of label/segment pairs by default' do
108
+ subject.parse(citation)[0].should == { :author => 'Perec, Georges', :title => 'A Void', :location => 'London', :publisher => 'The Harvill Press', :year => 1995, :pages => '108', :type => :book }
109
+ end
110
+
111
+ describe 'using output format "tags"' do
112
+ it 'returns a tagged string' do
113
+ subject.parse(citation, :tags)[0].should == '<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>'
114
+ end
115
+ end
116
+
117
+ it 'returns the label/token arrays for format "raw"' do
118
+ subject.parse(citation, :raw)[0][0].should == [:author, 'Perec,']
119
+ end
120
+ end
121
+
122
+
119
123
  end
120
124
  end
@@ -1,17 +1,24 @@
1
1
  begin
2
2
  require 'simplecov'
3
+ require 'coveralls' if ENV['CI']
3
4
  rescue LoadError
4
5
  # ignore
5
6
  end
6
7
 
7
8
  begin
8
- require 'debugger'
9
+ case
10
+ when defined?(RUBY_ENGINE) && RUBY_ENGINE == 'rbx'
11
+ require 'rubinius/debugger'
12
+ else
13
+ require 'debugger'
14
+ end
9
15
  rescue LoadError
10
16
  # ignore
11
17
  end
12
18
 
13
19
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
20
  $LOAD_PATH.unshift(File.dirname(__FILE__))
21
+
15
22
  require 'rspec'
16
23
  require 'anystyle/parser'
17
24
 
@@ -20,5 +27,5 @@ require 'anystyle/parser'
20
27
  Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
21
28
 
22
29
  RSpec.configure do |config|
23
-
30
+
24
31
  end
metadata CHANGED
@@ -1,66 +1,59 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anystyle-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
5
- prerelease:
4
+ version: 0.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Sylvester Keil
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-11-08 00:00:00.000000000 Z
11
+ date: 2014-02-07 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: bibtex-ruby
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
- version: '2.0'
19
+ version: '3.0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ~>
24
+ - - "~>"
28
25
  - !ruby/object:Gem::Version
29
- version: '2.0'
26
+ version: '3.0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: wapiti
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ~>
31
+ - - "~>"
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0.0'
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ~>
38
+ - - "~>"
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0.0'
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: namae
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ~>
45
+ - - "~>"
52
46
  - !ruby/object:Gem::Version
53
- version: 0.7.1
47
+ version: '0.8'
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ~>
52
+ - - "~>"
60
53
  - !ruby/object:Gem::Version
61
- version: 0.7.1
62
- description: A sophisticated parser for academic references based on machine learning
63
- algorithms using conditional random fields.
54
+ version: '0.8'
55
+ description: A sophisticated parser for academic reference lists and bibliographies
56
+ based on machine learning algorithms using conditional random fields.
64
57
  email:
65
58
  - http://sylvester.keil.or.at
66
59
  executables: []
@@ -69,9 +62,6 @@ extra_rdoc_files:
69
62
  - README.md
70
63
  - LICENSE
71
64
  files:
72
- - .autotest
73
- - .gitignore
74
- - .rspec
75
65
  - Gemfile
76
66
  - HISTORY.md
77
67
  - LICENSE
@@ -102,40 +92,33 @@ files:
102
92
  homepage: http://github.com/inukshuk/anystyle-parser
103
93
  licenses:
104
94
  - FreeBSD
95
+ metadata: {}
105
96
  post_install_message:
106
97
  rdoc_options:
107
- - --line-numbers
108
- - --inline-source
109
- - --title
110
- - ! '"Anystyle Parser"'
111
- - --main
98
+ - "--line-numbers"
99
+ - "--inline-source"
100
+ - "--title"
101
+ - "\"Anystyle Parser\""
102
+ - "--main"
112
103
  - README.md
113
104
  require_paths:
114
105
  - lib
115
106
  required_ruby_version: !ruby/object:Gem::Requirement
116
- none: false
117
107
  requirements:
118
- - - ! '>='
108
+ - - ">="
119
109
  - !ruby/object:Gem::Version
120
- version: '0'
121
- segments:
122
- - 0
123
- hash: -2142174744936810203
110
+ version: 1.9.3
124
111
  required_rubygems_version: !ruby/object:Gem::Requirement
125
- none: false
126
112
  requirements:
127
- - - ! '>='
113
+ - - ">="
128
114
  - !ruby/object:Gem::Version
129
115
  version: '0'
130
- segments:
131
- - 0
132
- hash: -2142174744936810203
133
116
  requirements: []
134
117
  rubyforge_project:
135
- rubygems_version: 1.8.24
118
+ rubygems_version: 2.2.1
136
119
  signing_key:
137
- specification_version: 3
138
- summary: Parser for academic references.
120
+ specification_version: 4
121
+ summary: Smart and fast academic bibliography parser.
139
122
  test_files:
140
123
  - features/step_definitions/parser_steps.rb
141
124
  - features/support/env.rb