RubyGems - anystyle-parser - Versions diffs - 0.2.1 → 0.3.0 - Mend

anystyle-parser 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +7 -0
data/Gemfile +28 -12
data/HISTORY.md +6 -0
data/LICENSE +2 -2
data/README.md +11 -11
data/Rakefile +14 -3
data/anystyle-parser.gemspec +13 -8
data/features/support/env.rb +18 -0
data/lib/anystyle/parser/dictionary.rb +35 -37
data/lib/anystyle/parser/errors.rb +18 -18
data/lib/anystyle/parser/parser.rb +254 -244
data/lib/anystyle/parser/utility.rb +18 -18
data/lib/anystyle/parser/version.rb +1 -1
data/spec/anystyle/parser/parser_spec.rb +119 -115
data/spec/spec_helper.rb +9 -2
metadata +26 -43
data/.autotest +0 -0
data/.gitignore +0 -5
data/.rspec +0 -3

data/lib/anystyle/parser/utility.rb CHANGED

@@ -1,19 +1,19 @@
 module Anystyle
-	def self.parse(*arguments)
-		Parser::Parser.instance.parse(*arguments)
-	end
-	def self.parser
-		Parser::Parser.instance
-	end
-	module Parser
-		def self.instance
-			Parser.instance
-		end
-	end
-end
+  def self.parse(*arguments)
+    Parser::Parser.instance.parse(*arguments)
+  end
+  def self.parser
+    Parser::Parser.instance
+  end
+  module Parser
+    def self.instance
+      Parser.instance
+    end
+  end
+end

data/lib/anystyle/parser/version.rb CHANGED

@@ -1,5 +1,5 @@
 module Anystyle
   module Parser
-    VERSION = '0.2.1'.freeze
+    VERSION = '0.3.0'.freeze
   end
 end

data/spec/anystyle/parser/parser_spec.rb CHANGED

@@ -1,120 +1,124 @@
 module Anystyle::Parser
   describe Parser do
     it { should_not be nil }
-		describe "#tokenize" do
-			it "returns [] when given an empty string" do
-				subject.tokenize('').should == []
-			end
-		  it "takes a single line and returns an array of token sequences" do
-				subject.tokenize('hello, world!').should == [%w{ hello, world! }]
-			end
-		  it "takes two lines and returns an array of token sequences" do
-				subject.tokenize("hello, world!\ngoodbye!").should == [%w{ hello, world! }, %w{ goodbye! }]
-			end
-			context "when passing a string marked as tagged" do
-				it "returns [] when given an empty string" do
-					subject.tokenize('', true).should == []
-				end
-				it "returns an array of :unknown token sequences when given an untagged single line" do
-					subject.tokenize('hello, world!', true).should == [[['hello,', :unknown], ['world!', :unknown]]]
-				end
-				it "returns an array of :unknown token sequences when given two untagged lines" do
-					subject.tokenize("hello,\nworld!", true).should == [[['hello,', :unknown]], [['world!', :unknown]]]
-				end
-				it "returns an array of token/tag pair for each line when given a single tagged string" do
-					subject.tokenize('<a>hello</a>', true).should == [[['hello', :a]]]
-				end
-				it "returns an array of token/tag pair for each line when given a string with multiple tags" do
-					subject.tokenize('<a>hello world</a> <b> !</b>', true).should == [[['hello',:a], ['world', :a], ['!', :b]]]
-				end
-				it "raises an argument error if the string contains mismatched tags" do
-					expect { subject.tokenize('<a> hello </b>', true) }.to raise_error(ArgumentError)
-					expect { subject.tokenize('<a> hello <b> world </a>', true) }.to raise_error(ArgumentError)
-				end
-			end
-		end
-		describe "#prepare" do
-			it 'returns an array of expanded token sequences' do
-				subject.prepare('hello, world!').should == [['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']]
-			end
-			context 'when marking the input as being tagged' do
-				let(:input) { %{<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>} }
-				it 'returns an array of expaned and labelled token sequences for a tagged string' do
-					subject.prepare(input, true)[0].map { |t| t[/\S+$/] }.should == %w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date }
-				end
-				it 'returns an array of expanded and labelled :unknown token sequences for an untagged input' do
-					subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }.should == %w{ unknown unknown }
-				end
-			end
-		end
-		describe "#label" do
-			let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
-			it 'returns an array of labelled segments' do
-				subject.label(citation)[0].map(&:first).should == [:author, :title, :location, :publisher, :date, :pages]
-			end
-			describe 'when passed more than one line' do
-				it 'returns two arrays' do
-					subject.label("foo\nbar").should have(2).elements
-				end
-			end
-			describe 'when passed invalid input' do
-				it 'returns an empty array for an empty string' do
-					subject.label('').should == []
-				end
-				it 'returns an empty array for an empty line' do
-					subject.label("\n").should == []
-					subject.label("\n ").should == [[],[]]
-					subject.label(" \n ").should == [[],[]]
-					subject.label(" \n").should == [[]]
-				end
-				it 'does not fail for unrecognizable input' do
-					lambda { subject.label("@misc{70213094902020,\n") }.should_not raise_error
-					lambda { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.should_not raise_error
-					pending
-					lambda { subject.label("\n doi ") }.should_not raise_error
-				end
-			end
-		end
-		describe "#parse" do
-			let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
-			it 'returns a hash of label/segment pairs by default' do
-				subject.parse(citation)[0].should == { :author => 'Perec, Georges', :title => 'A Void', :location => 'London', :publisher => 'The Harvill Press', :year => 1995, :pages => '108', :type => :book }
-			end
-			describe 'using output format "tags"' do
-				it 'returns a tagged string' do
-					subject.parse(citation, :tags)[0].should == '<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>'
-				end
-			end
-		end
+    describe "#tokenize" do
+      it "returns [] when given an empty string" do
+        subject.tokenize('').should == []
+      end
+      it "takes a single line and returns an array of token sequences" do
+        subject.tokenize('hello, world!').should == [%w{ hello, world! }]
+      end
+      it "takes two lines and returns an array of token sequences" do
+        subject.tokenize("hello, world!\ngoodbye!").should == [%w{ hello, world! }, %w{ goodbye! }]
+      end
+      context "when passing a string marked as tagged" do
+        it "returns [] when given an empty string" do
+          subject.tokenize('', true).should == []
+        end
+        it "returns an array of :unknown token sequences when given an untagged single line" do
+          subject.tokenize('hello, world!', true).should == [[['hello,', :unknown], ['world!', :unknown]]]
+        end
+        it "returns an array of :unknown token sequences when given two untagged lines" do
+          subject.tokenize("hello,\nworld!", true).should == [[['hello,', :unknown]], [['world!', :unknown]]]
+        end
+        it "returns an array of token/tag pair for each line when given a single tagged string" do
+          subject.tokenize('<a>hello</a>', true).should == [[['hello', :a]]]
+        end
+        it "returns an array of token/tag pair for each line when given a string with multiple tags" do
+          subject.tokenize('<a>hello world</a> <b> !</b>', true).should == [[['hello',:a], ['world', :a], ['!', :b]]]
+        end
+        it "raises an argument error if the string contains mismatched tags" do
+          expect { subject.tokenize('<a> hello </b>', true) }.to raise_error(ArgumentError)
+          expect { subject.tokenize('<a> hello <b> world </a>', true) }.to raise_error(ArgumentError)
+        end
+      end
+    end
+    describe "#prepare" do
+      it 'returns an array of expanded token sequences' do
+        subject.prepare('hello, world!').should == [['hello, , h he hel hell , o, lo, llo, hello other none 0 no-male no-female no-surname no-month no-place no-publisher no-journal no-editors 0 internal other none', 'world! ! w wo wor worl ! d! ld! rld! world other none 36 no-male no-female surname no-month no-place publisher no-journal no-editors 5 terminal other none']]
+      end
+      context 'when marking the input as being tagged' do
+        let(:input) { %{<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>} }
+        it 'returns an array of expaned and labelled token sequences for a tagged string' do
+          subject.prepare(input, true)[0].map { |t| t[/\S+$/] }.should == %w{ author author author author author author author author title title title title title title title editor editor editor editor editor editor editor editor editor editor editor booktitle booktitle booktitle booktitle booktitle date }
+        end
+        it 'returns an array of expanded and labelled :unknown token sequences for an untagged input' do
+          subject.prepare('hello, world!', true)[0].map { |t| t[/\S+$/] }.should == %w{ unknown unknown }
+        end
+      end
+    end
+    describe "#label" do
+      let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
+      it 'returns an array of labelled segments' do
+        subject.label(citation)[0].map(&:first).should == [:author, :title, :location, :publisher, :date, :pages]
+      end
+      describe 'when passed more than one line' do
+        it 'returns two arrays' do
+          subject.label("foo\nbar").should have(2).elements
+        end
+      end
+      describe 'when passed invalid input' do
+        it 'returns an empty array for an empty string' do
+          subject.label('').should == []
+        end
+        it 'returns an empty array for an empty line' do
+          subject.label("\n").should == []
+          subject.label("\n ").should == [[],[]]
+          subject.label(" \n ").should == [[],[]]
+          subject.label(" \n").should == [[]]
+        end
+        it 'does not fail for unrecognizable input' do
+          lambda { subject.label("@misc{70213094902020,\n") }.should_not raise_error
+          lambda { subject.label("doi = {DOI:10.1503/jpn.100140}\n}\n") }.should_not raise_error
+          pending
+          lambda { subject.label("\n doi ") }.should_not raise_error
+        end
+      end
+    end
+    describe "#parse" do
+      let(:citation) { 'Perec, Georges. A Void. London: The Harvill Press, 1995. p.108.' }
+      it 'returns a hash of label/segment pairs by default' do
+        subject.parse(citation)[0].should == { :author => 'Perec, Georges', :title => 'A Void', :location => 'London', :publisher => 'The Harvill Press', :year => 1995, :pages => '108', :type => :book }
+      end
+      describe 'using output format "tags"' do
+        it 'returns a tagged string' do
+          subject.parse(citation, :tags)[0].should == '<author>Perec, Georges.</author> <title>A Void.</title> <location>London:</location> <publisher>The Harvill Press,</publisher> <date>1995.</date> <pages>p.108.</pages>'
+        end
+      end
+      it 'returns the label/token arrays for format "raw"' do
+        subject.parse(citation, :raw)[0][0].should == [:author, 'Perec,']
+      end
+    end
   end
 end

data/spec/spec_helper.rb CHANGED

@@ -1,17 +1,24 @@
 begin
   require 'simplecov'
+  require 'coveralls' if ENV['CI']
 rescue LoadError
   # ignore
 end
 begin
-  require 'debugger'
+  case
+  when defined?(RUBY_ENGINE) && RUBY_ENGINE == 'rbx'
+    require 'rubinius/debugger'
+  else
+    require 'debugger'
+  end
 rescue LoadError
   # ignore
 end
 $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 $LOAD_PATH.unshift(File.dirname(__FILE__))
 require 'rspec'
 require 'anystyle/parser'
@@ -20,5 +27,5 @@ require 'anystyle/parser'
 Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
 RSpec.configure do |config|
 end

metadata CHANGED

@@ -1,66 +1,59 @@
 --- !ruby/object:Gem::Specification
 name: anystyle-parser
 version: !ruby/object:Gem::Version
-  version: 0.2.1
-  prerelease:
+  version: 0.3.0
 platform: ruby
 authors:
 - Sylvester Keil
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-11-08 00:00:00.000000000 Z
+date: 2014-02-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bibtex-ruby
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.0'
+        version: '3.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.0'
+        version: '3.0'
 - !ruby/object:Gem::Dependency
   name: wapiti
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0.0'
 - !ruby/object:Gem::Dependency
   name: namae
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.7.1
+        version: '0.8'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.7.1
-description: A sophisticated parser for academic references based on machine learning
-  algorithms using conditional random fields.
+        version: '0.8'
+description: A sophisticated parser for academic reference lists and bibliographies
+  based on machine learning algorithms using conditional random fields.
 email:
 - http://sylvester.keil.or.at
 executables: []
@@ -69,9 +62,6 @@ extra_rdoc_files:
 - README.md
 - LICENSE
 files:
-- .autotest
-- .gitignore
-- .rspec
 - Gemfile
 - HISTORY.md
 - LICENSE
@@ -102,40 +92,33 @@ files:
 homepage: http://github.com/inukshuk/anystyle-parser
 licenses:
 - FreeBSD
+metadata: {}
 post_install_message:
 rdoc_options:
-- --line-numbers
-- --inline-source
-- --title
-- ! '"Anystyle Parser"'
-- --main
+- "--line-numbers"
+- "--inline-source"
+- "--title"
+- "\"Anystyle Parser\""
+- "--main"
 - README.md
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
-      segments:
-      - 0
-      hash: -2142174744936810203
+      version: 1.9.3
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: -2142174744936810203
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 2.2.1
 signing_key:
-specification_version: 3
-summary: Parser for academic references.
+specification_version: 4
+summary: Smart and fast academic bibliography parser.
 test_files:
 - features/step_definitions/parser_steps.rb
 - features/support/env.rb