RubyGems - anystyle-parser - Versions diffs - 0.0.1 - Mend

anystyle-parser 0.0.1

Files changed (29) hide show

data/.autotest +0 -0
data/.gitignore +5 -0
data/.rspec +3 -0
data/Gemfile +21 -0
data/HISTORY.md +3 -0
data/LICENSE +26 -0
data/README.md +152 -0
data/anystyle-parser.gemspec +37 -0
data/features/step_definitions/parser_steps.rb +0 -0
data/features/support/env.rb +1 -0
data/lib/anystyle/parser/dictionary.rb +165 -0
data/lib/anystyle/parser/errors.rb +19 -0
data/lib/anystyle/parser/features.rb +164 -0
data/lib/anystyle/parser/normalizer.rb +322 -0
data/lib/anystyle/parser/parser.rb +240 -0
data/lib/anystyle/parser/support/anystyle.mod +7891 -0
data/lib/anystyle/parser/support/anystyle.pat +72 -0
data/lib/anystyle/parser/support/dict.txt.gz +0 -0
data/lib/anystyle/parser/utility.rb +19 -0
data/lib/anystyle/parser/version.rb +5 -0
data/lib/anystyle/parser.rb +17 -0
data/spec/anystyle/parser/dictionary_spec.rb +31 -0
data/spec/anystyle/parser/features_spec.rb +24 -0
data/spec/anystyle/parser/normalizer_spec.rb +36 -0
data/spec/anystyle/parser/parser_spec.rb +85 -0
data/spec/benchmark.rb +74 -0
data/spec/profile.rb +34 -0
data/spec/spec_helper.rb +1 -0
metadata +169 -0

data/.autotest ADDED Viewed

File without changes

data/.gitignore ADDED Viewed

@@ -0,0 +1,5 @@
+./doc/
+lib/anystyle/parser/support/dict.kch
+lib/anystyle/parser/support/dict.dbm
+Gemfile.lock
+*.rbc

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--color
+--require spec_helper
+--format progress

data/Gemfile ADDED Viewed

@@ -0,0 +1,21 @@
+source :rubygems
+gemspec
+group :debug do
+	gem 'ruby-debug19', :require => 'ruby-debug', :platforms => [:mri_19]
+	gem 'ruby-debug', :platforms => [:mri_18, :jruby]
+	gem 'rbx-trepanning', :platforms => [:rbx]
+end
+group :osx_test do
+	gem 'autotest-fsevent', :require => false
+end
+group :profile do
+	gem 'ruby-prof'
+	gem 'gnuplot'
+end
+group :kyotocabinet do
+	gem 'kyotocabinet-ruby', :require => 'kyotocabinet'
+end

data/HISTORY.md ADDED Viewed

@@ -0,0 +1,3 @@
+0.0.1 / 2011-09-05
+------------------
+* Initial release

data/LICENSE ADDED Viewed

@@ -0,0 +1,26 @@
+Copyright 2011 Sylvester Keil. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+The views and conclusions contained in the software and documentation are
+those of the authors and should not be interpreted as representing official
+policies, either expressed or implied, of the copyright holder.

data/README.md ADDED Viewed

@@ -0,0 +1,152 @@
+Anystyle-Parser
+===============
+Anystyle-Parser is a very fast and smart parser for academic references. It
+is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
+[FreeCite](http://freecite.library.brown.edu/); Anystyle-Parser is designed
+for raw speed (it uses [wapiti](https://github.com/inukshuk/wapiti-ruby) based
+conditional random fields and [Kyoto Cabinet](http://fallabs.com/kyotocabinet/)
+as a key-value store), flexibility (it is easy to train the model with
+data that is relevant to your parsing needs), and compatibility (Anystyle-Parser
+exports to Ruby Hashes, BibTeX, or the CiteProc JSON format).
+Installation
+------------
+    $ [sudo] gem install anystyle-parser
+During the statistical analysis of reference strings, Anystyle-Parser relies
+on a large feature dictionary; by default, Anystyle-Parser creates a
+[Kyoto Cabinet](http://fallabs.com/kyotocabinet/) file-based hash database
+from the dictionary file that ships with the parser. If Kyoto Cabinet is
+not installed on your system, Anystyle-Parser uses a simple Ruby Hash as a
+fall-back; this Hash has to be re-created every time you load the parser
+and takes up a lot of memory in your Ruby process; it is therefore strongly
+recommended to install Kyoto Cabinet and the `kyotocabinet-ruby` gem.
+    $ [sudo] gem install kyotocabinet-ruby
+The database file will be created the first time you access the dictionary;
+note that you will need write permissions in the directory where the file
+is to be created. You can change the Dictionary's default path in the
+Dictionary's options:
+    Anystyle::Parser::Dictionary.instance.options[:path]
+Usage
+-----
+### Parsing
+You can access the main Anystyle-Parser instance at `Anystyle.parser`;
+the `#parse` method is also available via `Anystyle.parse`. For more complex
+requirements (e.g., if you need multiple Parsers simultaneously) you can create
+your own instances from the `Anystyle::Parser::Parser` class.
+The two fundamental methods you need to know about in order to use
+Anystyle-Parser are `#parse` and `#train` that both accept two arguments.
+    Parser#parse(input, format = :hash)
+    Parser#train(input, truncate = false)
+`#parse` parses the passed-in input (either a filename, your reference strings,
+or an array of your reference strings) and returns the parsed data in the
+format specified as the second argument (supported formats include: *:hash*,
+*:bibtex*, and *:citeproc*).
+`#train` allows you to easily train the Parser's CRF model. The first argument
+is either a filename or your data as a string; the format of training data
+follows the XML-like syntax of the
+[CORA dataset](http://www.cs.umass.edu/~mccallum/data/cora-ie.tar.gz); the
+optional boolean argument lets you decide whether to train the existing
+model or to create an entirely new one.
+The following irb session illustrates some parser goodness:
+    > require 'anystyle/parser'
+    > Anystyle.parse 'Poe, Edgar A. Essays and Reviews. New York: Library of America, 1984.'
+    => [{:author=>"Poe, Edgar A.", :title=>"Essays and Reviews", :location=>"New York", :publisher=>"Library of America", :year=>1984, :type=>:book}]
+    > b = Anystyle.parse 'Dong C. Liu and Jorge Nocedal. 1989. On the limited memory BFGS method for large scale optimization. Mathematical Programming, 45:503–528.', :bibtex
+    > b[0].author[1].given
+    => "Jorge"
+    > b[0].author.to_s
+    => "Liu, Dong C. and Nocedal, Jorge"
+### Unhappy with the results?
+Citation references come in many forms, so, inevitably, you will find data
+where Anystyle-Parser does not produce satisfying parsing results.
+    > Anystyle.parse 'John Lafferty, Andrew McCallum, and Fernando Pereira. 2001. Conditional random fields: probabilistic models for segmenting and labeling sequence data. In Proceedings of the International Conference on Machine Learning, pages 282-289. Morgan Kaufmann, San Francisco, CA.'
+    => [{:author=>"John Lafferty and Andrew McCallum and Fernando Pereira. 2001", :title=>"Conditional random fields: probabilistic models for segmenting and labeling sequence data", :booktitle=>"Proceedings of the International Conference on Machine Learning", :pages=>"282--289", :publisher=>"Morgan Kaufmann", :location=>"San Francisco, CA", :type=>:inproceedings}]
+This result is not bad, but notice how the year was not picked up as a date
+but interpreted as part of the author name. If you have such a problem
+(particularly, if the problem applies to a range of your input data, e.g.,
+data that follows a style that Anystyle-Parser was not trained to recognize),
+you can teach Anystyle-Parser to recognize your format. The easiest way to
+go about this is to create a new file (e.g., 'training.txt'), copy and paste a
+few references, and tag them for training. For example, a tagged version of
+the input from the example above would look like this:
+    <author> John Lafferty, Andrew McCallum, and Fernando Pereira. </author> <date> 2001. </date> <title> Conditional random fields: probabilistic models for segmenting and labeling sequence data. </title> <booktitle> In Proceedings of the International Conference on Machine Learning, </booktitle> <pages> pages 282–289. </pages> <publisher> Morgan Kaufmann, </publisher> <location> San Francisco, CA. </location>
+Note that you can pick any tag names, but when working with Anystyle's model
+you should use the same names used to train the model. You can always ask
+the Parser's model what names (labels) it knows about:
+    > Anystyle.parser.model.labels
+    => ["author", "booktitle", "container", "date", "edition", "editor", "institution", "journal", "location", "note", "pages", "publisher", "tech", "title", "unknown", "volume"]
+Once you have tagged a few references that you want Anystyle-Parser to learn,
+you can train the model as follows:
+    > Anystyle.parser.train 'training.txt', false
+By passing `true` as the second argument, you will discard Anystyle's default
+model; the resulting model will be based entirely on your own data. By default
+the new or altered model will not be saved, but you can do so at any time
+by calling `Anystyle.parser.model.save` to save the model to the default file.
+If you want to save the model to a different file, set the
+`Anystyle.parser.model.path` attribute accordingly.
+After teaching Anystyle-Parser with the tagged references, try to parse your
+data again:
+    > Anystyle.parse 'John Lafferty, Andrew McCallum, and Fernando Pereira. 2001. Conditional random fields: probabilistic models for segmenting and labeling sequence data. In Proceedings of the International Conference on Machine Learning, pages 282-289. Morgan Kaufmann, San Francisco, CA.'
+    => [{:author=>"John Lafferty and Andrew McCallum and Fernando Pereira", :title=>"Conditional random fields: probabilistic models for segmenting and labeling sequence data", :booktitle=>"Proceedings of the International Conference on Machine Learning", :pages=>"282--289", :publisher=>"Morgan Kaufmann", :location=>"San Francisco, CA", :year=>2001, :type=>:inproceedings}]
+Contributing
+------------
+The Anystyle-Parser source code is
+[hosted on GitHub](http://github.com/inukshuk/anystyle-parser/).
+You can check out a copy of the latest code using Git:
+    $ git clone https://github.com/inukshuk/anystyle-parser.git
+If you've found a bug or have a question, please open an issue on the
+[Anystyle-Parser issue tracker](http://github.com/inukshuk/anystyle-parser/issues).
+Or, for extra credit, clone the Anystyle-Parser repository, write a failing
+example, fix the bug and submit a pull request.
+License
+-------
+Copyright 2011 Sylvester Keil. All rights reserved.
+Some of the code in Anystyle-Parser's post processing (normalizing) routines
+was originally based on the source code of FreeCite and
+Copyright 2008 Public Display Inc.
+The CRF template is a modified version of ParsCit's original template
+Copyright 2008, 2009, 2010, 2011 Min-Yen Kan,
+Isaac G. Councill, C. Lee Giles, Minh-Thang Luong and Huy Nhat Hoang
+Do.
+Anystyle-Parser is distributed under a BSD-style license. See LICENSE for details.

data/anystyle-parser.gemspec ADDED Viewed

@@ -0,0 +1,37 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib/', __FILE__)
+$:.unshift lib unless $:.include?(lib)
+require 'anystyle/parser/version'
+# Gem specification for anystyle-parser. The version number is taken from
+# Anystyle::Parser::VERSION.
+Gem::Specification.new do |s|
+  s.name        = 'anystyle-parser'
+  s.version     = Anystyle::Parser::VERSION.dup
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ['Sylvester Keil']
+  # NOTE(review): this value is a URL rather than an e-mail address -- confirm.
+  s.email       = ['http://sylvester.keil.or.at']
+  s.homepage    = 'http://inukshuk.github.com/anystyle-parser'
+  s.summary     = 'Parser for academic references.'
+  s.description = 'A sophisticated parser for academic references based on conditional random fields.'
+  s.license     = 'FreeBSD'
+  # Runtime dependencies.
+  s.add_runtime_dependency('bibtex-ruby', '~>1.3')
+  s.add_runtime_dependency('wapiti', '~>0.0')
+  # Development and test dependencies.
+  s.add_development_dependency('rake', ['~>0.9'])
+  s.add_development_dependency('racc', ['~>1.4'])
+  s.add_development_dependency('cucumber', ['~>1.0'])
+  s.add_development_dependency('rspec', ['~>2.6'])
+  s.add_development_dependency('ZenTest', ['~>4.6'])
+  # The file lists are produced by shelling out to git, so the gem has to be
+  # built from a git checkout; files below resources/ are left out.
+  s.files        = `git ls-files`.split("\n") - Dir['resources/**/*']
+  s.test_files   = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables  = []
+  s.require_path = 'lib'
+  s.rdoc_options      = %w{--line-numbers --inline-source --title "Anystyle\ Parser" --main README.md}
+  s.extra_rdoc_files  = %w{README.md LICENSE}
+end
+# vim: syntax=ruby

data/features/step_definitions/parser_steps.rb ADDED Viewed

File without changes

data/features/support/env.rb ADDED Viewed

@@ -0,0 +1 @@
+require 'anystyle/parser'

data/lib/anystyle/parser/dictionary.rb ADDED Viewed

@@ -0,0 +1,165 @@
+module Anystyle
+	module Parser
+		# Dictionary is a Singleton object that provides a key-value store of
+		# the Anystyle Parser dictionary required for feature elicitation.
+		# This dictionary acts essentially like a Ruby Hash object, but because
+		# of the dictionary's size it is not efficient to keep the entire
+		# dictionary in memory at all times. For that reason, Dictionary
+		# creates a persistent data store on disk using Kyoto Cabinet; if
+		# Kyoto Cabinet is not installed a Ruby Hash is used as a fall-back.
+		#
+		# The database will be automatically created from the dictionary file
+		# using the best available DBM the first time it is accessed. Once the
+		# database file exists, the database will be restored from file.
+		# Therefore, if you make changes to the dictionary file, you will have
+		# to delete the old database file for a new one to be created.
+		#
+		# Database creation requires write permissions. By default, the database
+		# will be created in the support directory of the Parser; if you have
+		# installed the gem version of the Parser, you may not have write
+		# permissions, but you can change the path in the Dictionary's options.
+		#
+		#     Dictionary.instance.options[:path] # => the database file
+		#     Dictionary.instance.options[:source] # => the (zipped) dictionary file
+		#
+		class Dictionary
+			include Singleton
+			@defaults = {
+				:source => File.expand_path('../support/dict.txt.gz', __FILE__),
+				:path => File.expand_path('../support/dict.kch', __FILE__)
+			}.freeze
+			@keys = [:male, :female, :surname, :month, :place, :publisher, :journal].freeze
+			@code = Hash[*@keys.zip(0.upto(@keys.length-1).map { |i| 2**i }).flatten]
+			@code.default = 0
+			@code.freeze
+			@mode = begin
+				require 'kyotocabinet'
+				:kyoto
+			rescue LoadError
+				:hash
+			end
+			class << self
+				attr_reader :keys, :code, :defaults, :mode
+			end
+			attr_reader :options
+			def initialize
+				@options = Dictionary.defaults.dup
+			end
+			def [](key)
+				db[key.to_s].to_i
+			end
+			def []=(key, value)
+				db[key.to_s] = value
+			end
+			def create
+				case Dictionary.mode
+				when :kyoto
+					truncate
+					@db = KyotoCabinet::DB.new
+					unless @db.open(path, KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE)
+						raise DatabaseError, "failed to create cabinet file #{path}: #{@db.error}"
+					end
+					populate
+					close
+				else
+					# nothing
+				end
+			end
+			def truncate
+				close
+				File.unlink(path) if File.exists?(path)
+			end
+			def open
+				create unless File.exists?(path)
+				case Dictionary.mode
+				when :kyoto
+					at_exit { ::Anystyle::Parser::Dictionary.instance.close }
+					@db = KyotoCabinet::DB.new
+					unless @db.open(path, KyotoCabinet::DB::OREADER)
+						raise DictionaryError, "failed to open cabinet file #{path}: #{@db.error}"
+					end
+				else
+					@db = Hash.new(0)
+					populate
+				end
+				@db
+			end
+			def open?; !!@db; end
+			def close
+				@db.close if @db.respond_to?(:close)
+				@db = nil
+			end
+			def path
+				options[:path]
+			end
+			private
+			def db
+				@db || open
+			end
+			def populate
+				require 'zlib'
+				File.open(options[:source], 'r:UTF-8') do |f|
+					mode = 0
+					Zlib::GzipReader.new(f).each do |line|
+						line.strip!
+						if line.start_with?('#')
+							case line
+							when /^## male/i
+								mode = Dictionary.code[:male]
+			        when /^## female/i
+			          mode = Dictionary.code[:female]
+			        when /^## (?:surname|last|chinese)/i
+			          mode = Dictionary.code[:surname]
+			        when /^## months/i
+			          mode = Dictionary.code[:month]
+			        when /^## place/i
+			          mode = Dictionary.code[:place]
+			        when /^## publisher/i
+			          mode = Dictionary.code[:publisher]
+			        when /^## journal/i
+			          mode = Dictionary.code[:journal]
+							else
+								# skip comments
+							end
+						else
+							key, probability = line.split(/\s+(\d+\.\d+)\s*$/)
+							value = self[key]
+							self[key] = value + mode if value < mode
+						end
+					end
+				end
+			end
+		end
+	end
+end

data/lib/anystyle/parser/errors.rb ADDED Viewed

@@ -0,0 +1,19 @@
+module Anystyle
+	module Parser
+		class Error < StandardError
+			attr_accessor :original
+			def initialize(message = nil, original = $!)
+				super(message)
+				@original = original
+			end
+		end
+		class DictionaryError < Error; end
+		class TrainingError < Error; end
+	end
+end

data/lib/anystyle/parser/features.rb ADDED Viewed

@@ -0,0 +1,164 @@
+# -*- encoding: utf-8 -*-
+module Anystyle
+	module Parser
+		class Feature
+			@dict = Dictionary.instance
+			@instances = []
+			class << self
+				attr_reader :dict, :instances
+				def define(name, &block)
+					instances << new(name, block)
+				end
+				def undefine(name)
+					instances.reject! { |f| f.name == name }
+				end
+			end
+			attr_accessor :name, :matcher
+			def initialize(name, matcher)
+				@name, @matcher = name, matcher
+			end
+			def match(*arguments)
+				matcher.call(*arguments)
+			end
+		end
+		# Is the the last character upper-/lowercase, numeric or something else?
+		# Returns A, a, 0 or the last character itself.
+		Feature.define :last_character do |token, stripped, sequence, offset|
+			case char = token.split(//)[-1]
+			when /^[[:upper:]]$/
+				:upper
+			when /^[[:lower:]]$/
+				:lower
+			when /^\d$/
+				:numeric
+			else
+				char
+			end
+		end
+		# Sequences of the first four characters
+		Feature.define :first do |token, stripped, sequence, offset|
+			c = token.split(//)[0,4]
+			(0..3).map { |i| c[0..i].join }
+		end
+		# Sequences of the last four characters
+		Feature.define :last do |token, stripped, sequence, offset|
+			c = token.split(//).reverse[0,4]
+			(0..3).map { |i| c[0..i].reverse.join }
+		end
+		Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
+			stripped.empty? ? :EMPTY : stripped.downcase
+		end
+		Feature.define :capitalization do |token, stripped, sequence, offset|
+			case stripped
+			when /^[[:upper:]]$/
+				:single
+			when /^[[:upper:]][[:lower:]]/
+				:initial
+			when /^[[:upper:]]+$/
+				:all
+			else
+				:other
+			end
+		end
+		# Classifies the numeric content of the token. The patterns are tried
+		# from top to bottom and the first one that matches decides the result,
+		# so the order of the branches matters; :none is returned if no
+		# pattern matches.
+		Feature.define :numbers do |token, stripped, sequence, offset|
+			case token
+			# a digit directly followed by a parenthesized number or range
+			when /\d\(\d+(-\d+)?\)/
+				:volume
+			# four digits in parentheses, or a number from 1000 to 2099
+			# optionally followed by one punctuation character
+			when /^\(\d{4}\)\W*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
+				:year
+			# a number range, or a 'p.' / 'pp.' marker
+			when /\d+\s*--?\s*\d+/, /^\W*pp?\.\d*\W*$/
+				:page
+			when /^\d$/
+				:single
+			when /^\d{2}$/
+				:double
+			when /^\d{3}$/
+				:triple
+			when /^\d+$/
+				:digits
+			when /\d+(th|st|nd|rd)\W*/i
+				:ordinal
+			# any other token that contains a digit
+			when /\d/
+				:numeric
+			else
+				:none
+			end
+		end
+		Feature.define :dictionary do |token, stripped, sequence, offset|
+			c = Feature.dict[stripped.downcase]
+			f = Dictionary.keys.map do |k|
+				c & Dictionary.code[k] > 0 ? k : ['no',k].join('-').to_sym
+			end
+			f.unshift(c)
+		end
+		# TODO sequence features should be called just once per sequence
+		# TODO improve / disambiguate edition
+		Feature.define :editors do |token, stripped, sequence, offest|
+			sequence.any? { |t| t =~ /^(ed|editor|editors|eds|edited)$/i } ? :editors : :'no-editors'
+		end
+		# TODO Translated
+		Feature.define :location do |token, stripped, sequence, offset|
+			((offset.to_f / sequence.length) * 10).round
+		end
+		# Classifies the punctuation of the token. The patterns are tried from
+		# top to bottom and the first one that matches decides the result;
+		# :others is returned if no pattern matches.
+		Feature.define :punctuation do |token, stripped, sequence, offset|
+			case token
+			# starts with a quotation mark
+			when /^["'”’´‘“`]/
+				:quote
+			# ends with a quotation mark
+			when /["'”’´‘“`]$/
+				:unquote
+			# contains a hyphen anywhere
+			when /-+/
+				:hyphen
+			# NOTE(review): a trailing '-' is already caught by the :hyphen
+			# branch above, so the '-' in this pattern cannot take effect.
+			when /[,;:-]$/
+				:internal
+			# NOTE(review): a trailing '"' or "'" is already caught by the
+			# :unquote branch above, so only '!', '?' and '.' get here.
+			when /[!\?\."']$/
+				:terminal
+			# enclosed in brackets, optionally followed by one more character
+			when /^[\(\[\{<].*[>\}\]\)].?$/
+				:braces
+			# digits followed by parenthesized digits, e.g. 12(34)
+			when /^\d{2,5}\(\d{2,5}\).?$/
+				:volume
+			else
+				:others
+			end
+		end
+		# Guesses the type of the reference. The first two checks look at the
+		# whole sequence (joined with spaces), the third one at the current
+		# token and its neighbours; :other is returned if none applies.
+		Feature.define :type do |token, stripped, sequence, offset|
+			s = sequence.join(' ')
+			case
+			when s =~ /dissertation abstract/i
+				# NOTE(review): the symbol is spelled :dissertaion. It is a runtime
+				# value, and presumably the shipped model was trained with this
+				# spelling -- verify before renaming it.
+				:dissertaion
+			when s =~ /proceeding/i
+				:proceedings
+			# The token is 'in', the next token starts with an uppercase letter
+			# and the previous token ends with a quotation mark or with one of
+			# '.', ';', ','. For offset 0, sequence[offset-1] is sequence[-1],
+			# i.e. the last token of the sequence.
+			when stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
+				:collection
+			else
+				:other
+			end
+		end
+	end
+end