anystyle-parser 0.0.1

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
# -*- encoding: utf-8 -*-

module Anystyle
  module Parser

    class Normalizer

      include Singleton

      MONTH = Hash.new do |h,k|
        case k
        when /jan/i
          h[k] = 1
        when /feb/i
          h[k] = 2
        when /mar/i
          h[k] = 3
        when /apr/i
          h[k] = 4
        when /ma[yi]/i
          h[k] = 5
        when /jun/i
          h[k] = 6
        when /jul/i
          h[k] = 7
        when /aug/i
          h[k] = 8
        when /sep/i
          h[k] = 9
        when /o[ck]t/i
          h[k] = 10
        when /nov/i
          h[k] = 11
        when /dec/i
          h[k] = 12
        else
          h[k] = nil
        end
      end
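
      # For example, MONTH['Sept.'] returns (and memoizes) 9, while an
      # unrecognized key is cached as nil.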

      def method_missing(name, *arguments, &block)
        case name.to_s
        when /^normalize_(.+)$/
          normalize($1.to_sym, *arguments, &block)
        else
          super
        end
      end
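
      # Thanks to the dispatch above, a call such as normalize_publisher(hash)
      # falls back to the generic #normalize for any label that has no
      # dedicated handler below.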

      # Default normalizer. Strips punctuation.
      def normalize(key, hash)
        token, *dangling = hash[key]
        unmatched(key, hash, dangling) unless dangling.empty?

        token.gsub!(/^\W+|\W+$/, '')
        hash[key] = token
        hash
      rescue => e
        warn e.message
        hash
      end

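      # Normalizes the :author value. If the token is actually an editor
      # signature (e.g. "J. Doe (ed.)"), the value is moved to :editor;
      # otherwise "et al." markers are recorded and the names normalized.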
      def normalize_author(hash)
        authors, *dangling = hash[:author]
        unmatched(:author, hash, dangling) unless dangling.empty?

        if authors =~ /\W*[Ee]d(s|itors)?\W*$/ && !hash.has_key?(:editor)
          hash[:editor] = hash.delete(:author)
          normalize_editor(hash)
        else
          hash['more-authors'] = true if !!authors.sub!(/\bet\.?\s*al.*$/i, '')
          authors.gsub!(/^\W+|\W+$/, '')
          hash[:author] = normalize_names(authors)
        end

        hash
      rescue => e
        warn e.message
        hash
      end

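      # Normalizes the :editor value: extracts an edition number if present,
      # strips "ed./eds./edited by" markers, and, when the token turns out to
      # be a translator credit, copies the names to :translator as well.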
      def normalize_editor(hash)
        editors, edition = hash[:editor]

        unless edition.nil?
          if edition =~ /(\d+)/
            hash[:edition] = $1.to_i
          end
        end

        hash['more-editors'] = true if !!editors.sub!(/\bet\.?\s*al.*$/i, '')

        editors.gsub!(/^\W+|\W+$/, '')
        editors.gsub!(/^in\s+/i, '')
        editors.gsub!(/\W*[Ee]d(s|itors|ited)?\W*?/, '')
        editors.gsub!(/\bby\b/i, '')

        is_trans = !!editors.gsub!(/\W*trans(lated)?\W*/i, '')

        hash[:editor] = normalize_names(editors)
        hash[:translator] = hash[:editor] if is_trans

        hash
      rescue => e
        warn e.message
        hash
      end

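      # Normalizes the :translator value by stripping "trans./translated by"
      # markers before normalizing the names.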
      def normalize_translator(hash)
        translators = hash[:translator]

        translators.gsub!(/^\W+|\W+$/, '')
        translators.gsub!(/\W*trans(lated)?\W*/i, '')
        translators.gsub!(/\bby\b/i, '')

        hash[:translator] = normalize_names(translators)
        hash
      rescue => e
        warn e.message
        hash
      end

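      # Tokenizes a string of multiple names, adds missing periods after
      # initials, and joins the names with ' and ', e.g.
      # "Poe, E A, and J Verne" #=> "Poe, E. A. and J. Verne"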
      def normalize_names(names)
        names = tokenize_names(names).map do |name|
          name.strip!
          name.gsub!(/\b([[:upper:]])(\W|$)/) { [$1, $2 == ?. ? nil : ?., $2].compact.join }
          name
        end
        names.join(' and ')
      rescue => e
        warn e.message
        names
      end

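      # Splits a name list on 'and'/'&' and on commas that separate names
      # (as opposed to the comma in "Last, First"); suffixes such as Jr.
      # or PhD are kept with the preceding name.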
      def tokenize_names(names)
        s, n, ns, cc = StringScanner.new(names), '', [], 0
        until s.eos?
          case
          when s.scan(/,?\s*and\b|&/)
            ns << n
            n, cc = '', 0
          when s.scan(/\s+/)
            n << ' '
          when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\.?/i)
            n << s.matched
          when s.scan(/,/)
            if cc > 0 || n =~ /\w\w+\s+\w\w+/
              ns << n
              n, cc = '', 0
            else
              n << s.matched
              cc += 1
            end
          when s.scan(/\w+/), s.scan(/./)
            n << s.matched
          end
        end
        ns << n
      end

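      # Normalizes the :title value: a second captured token is promoted to
      # :container, edition statements are extracted, and trailing
      # punctuation and surrounding quotation marks are removed.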
      def normalize_title(hash)
        title, container = hash[:title]

        unless container.nil?
          hash[:container] = container
          normalize(:container, hash)
        end

        extract_edition(title, hash)

        title.gsub!(/[\.,:;\s]+$/, '')
        title.gsub!(/^["'”’´‘“`]|["'”’´‘“`]$/, '')

        hash[:title] = title

        hash
      rescue => e
        warn e.message
        hash
      end

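      # Moves edition statements such as "2nd ed.", "revised", or "reprint"
      # from the token into hash[:edition].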
      def extract_edition(token, hash)
        edition = [hash[:edition]].flatten.compact

        if token.gsub!(/\W*(\d+)(?:st|nd|rd|th)?\s*ed(?:ition|\.)?\W*/i, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Ee]xpanded)\W*$/, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Ii]llustrated)\W*$/, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Rr]evised)\W*$/, '')
          edition << $1
        end

        if token.gsub!(/(?:\band)?\W*([Rr]eprint)\W*$/, '')
          edition << $1
        end

        hash[:edition] = edition.join(', ') unless edition.empty?
      end

      def normalize_booktitle(hash)
        booktitle, *dangling = hash[:booktitle]
        unmatched(:booktitle, hash, dangling) unless dangling.empty?

        booktitle.gsub!(/^in\s*/i, '')

        extract_edition(booktitle, hash)

        booktitle.gsub!(/[\.,:;\s]+$/, '')
        hash[:booktitle] = booktitle

        hash
      rescue => e
        warn e.message
        hash
      end

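      # Normalizes the :container value; "Dissertation Abstracts" containers
      # additionally set :category and mark the reference as a :phdthesis.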
      def normalize_container(hash)
        container, *dangling = hash[:container]
        unmatched(:container, hash, dangling) unless dangling.empty?

        case container
        when /dissertation abstracts/i
          container.gsub!(/\s*section \w: ([\w\s]+).*$/i, '')
          hash[:category] = $1 unless $1.nil?
          hash[:type] = :phdthesis
        end

        hash[:container] = container
        hash
      rescue => e
        warn e.message
        hash
      end

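      # Normalizes the :date value: a recognizable month name sets :month,
      # and a four-digit year replaces :date with :year.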
      def normalize_date(hash)
        date, *dangling = hash[:date]
        unmatched(:date, hash, dangling) unless dangling.empty?

        unless (month = MONTH[date]).nil?
          hash[:month] = month
        end

        if date =~ /(\d{4})/
          hash[:year] = $1.to_i
          hash.delete(:date)
        end

        hash
      rescue => e
        warn e.message
        hash
      end

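      # Splits combined volume/issue tokens into :volume and :number,
      # e.g. "vol. 3, no. 7" yields :volume => 3 and :number => 7.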
      def normalize_volume(hash)
        volume, *dangling = hash[:volume]
        unmatched(:volume, hash, dangling) unless dangling.empty?

        case volume
        when /\D*(\d+)\D+(\d+[\s&-]+\d+)/
          hash[:volume], hash[:number] = $1.to_i, $2
        when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
          hash[:volume] = $1.to_i unless $1.nil?
          hash[:number] = $2
        when /(\d+)?\D+no\.\s*(\d+)/
          hash[:volume] = $1.to_i unless $1.nil?
          hash[:number] = $2.to_i
        when /\D*(\d+)\D+(\d+)/
          hash[:volume], hash[:number] = $1.to_i, $2.to_i
        when /(\d+)/
          hash[:volume] = $1.to_i
        end

        hash
      rescue => e
        warn e.message
        hash
      end

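      # Normalizes the :pages value: "volume.issue(year):pages" citations are
      # split into their parts and page ranges use BibTeX's double dash,
      # e.g. "72.2(1994):12-25" yields :volume, :number, :year and "12--25".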
      def normalize_pages(hash)
        pages, *dangling = hash[:pages]
        unmatched(:pages, hash, dangling) unless dangling.empty?

        # "volume.issue(year):pp"
        case pages
        when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
          hash[:volume] = $1.to_i
          hash[:number] = $2.to_i unless $2.nil?
          hash[:year] = $3.to_i unless $3.nil?
          hash[:pages] = $4
        end

        case hash[:pages]
        when /(\d+)\D+(\d+)/
          hash[:pages] = [$1, $2].join('--')
        when /(\d+)/
          hash[:pages] = $1
        end

        hash
      rescue => e
        warn e.message
        hash
      end

      private

      def unmatched(label, hash, tokens)
        hash["unmatched-#{label}"] = tokens.join(' ')
      end

    end

  end
end
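
The normalizer is a label-keyed singleton. A minimal sketch of driving it directly (assuming the gem and its dependencies are loaded and that `require 'anystyle/parser'` is the entry point):

    require 'anystyle/parser'

    hash = { :author => 'Poe, E A, and J Verne', :date => 'Sept. 2011' }
    normalizer = Anystyle::Parser::Normalizer.instance
    normalizer.normalize_author(hash) # hash[:author] => "Poe, E. A. and J. Verne"
    normalizer.normalize_date(hash)   # hash[:month] => 9, hash[:year] => 2011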
@@ -0,0 +1,240 @@
module Anystyle
  module Parser

    class Parser

      @models = Hash.new { |h,k| k }.merge(
        :anystyle => File.expand_path('../support/anystyle.mod', __FILE__),
        :cora => File.expand_path('../support/cora.mod', __FILE__)
      )
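
      # Unknown model names fall through the Hash default above and are
      # used verbatim, so arbitrary model paths are supported.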

      @formats = [:bibtex, :hash, :citeproc].freeze

      @defaults = {
        :model => :anystyle,
        :pattern => File.expand_path('../support/anystyle.pat', __FILE__),
        :separator => /\s+/,
        :tagged_separator => /\s+|(<\/?[^>]+>)/,
        :strip => /\W/,
        :format => :hash
      }.freeze

      @features = Feature.instances
      @feature = Hash.new { |h,k| h[k.to_sym] = features.detect { |f| f.name == k.to_sym } }

      class << self

        attr_reader :defaults, :features, :feature, :models, :formats

        def load(path)
          p = new
          p.model = Wapiti.load(path)
          p
        end

        # Returns a default parser instance
        def instance
          @instance ||= new
        end

      end

      attr_reader :options

      attr_accessor :model, :normalizer

      def initialize(options = {})
        @options = Parser.defaults.merge(options)
        @model = Wapiti.load(Parser.models[@options[:model]])
        @normalizer = Normalizer.instance
      end

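      # Parses the passed-in references and returns them in the requested
      # format; supported formats are :hash (default), :bibtex, and :citeproc.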
      def parse(input, format = options[:format])
        formatter = "format_#{format}".to_sym
        send(formatter, label(input))
      rescue NoMethodError
        raise ArgumentError, "format not supported: #{formatter}"
      end

      # Returns an array of label/segment pairs for each line in the passed-in string.
      def label(input, labelled = false)
        string = input_to_s(input)

        model.label(prepare(string, labelled)).map! do |sequence|
          sequence.inject([]) do |ts, (token, label)|
            token, label = token[/^\S+/], label.to_sym
            if (prev = ts[-1]) && prev[0] == label
              prev[1] << ' ' << token
              ts
            else
              ts << [label, token]
            end
          end
        end
      end

      # Returns an array of tokens for each line of input.
      #
      # If the passed-in string is marked as being tagged, extracts labels
      # from the string and returns an array of token/label pairs for each
      # line of input.
      def tokenize(string, tagged = false)
        if tagged
          string.split(/[\n\r]+/).each_with_index.map do |s,i|
            tt, tokens, tags = s.split(options[:tagged_separator]), [], []

            tt.each do |token|
              case token
              when /^$/
                # skip
              when /^<([^\/>][^>]*)>$/
                tags << $1
              when /^<\/([^>]+)>$/
                unless (tag = tags.pop) == $1
                  raise ArgumentError, "mismatched tags on line #{i}: #{$1.inspect} (current tag was #{tag.inspect})"
                end
              else
                tokens << [token, (tags[-1] || :unknown).to_sym]
              end
            end

            tokens
          end
        else
          string.split(/[\n\r]+/).map { |s| s.split(options[:separator]) }
        end
      end

      # Prepares the passed-in string for processing by a CRF tagger. The
      # string is split into separate lines; each line is tokenized and
      # expanded. Returns an array of sequence arrays that can be labelled
      # by the CRF model.
      #
      # If the string is marked as being tagged by passing +true+ as the
      # second argument, training labels will be extracted from the string
      # and appended after feature expansion. The returned sequence arrays
      # can be used for training or testing the CRF model.
      def prepare(input, tagged = false)
        string = input_to_s(input)
        tokenize(string, tagged).map { |tk| tk.each_with_index.map { |(t,l),i| expand(t,tk,i,l) } }
      end

      # Expands the passed-in token string by appending a space separated list
      # of all features for the token.
      def expand(token, sequence = [], offset = 0, label = nil)
        f = features_for(token, strip(token), sequence, offset)
        f.unshift(token)
        f.push(label) unless label.nil?
        f.join(' ')
      end

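      # Trains the CRF model on tagged input; pass +true+ as the second
      # argument to start from a blank model instead of extending the
      # current one.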
      def train(input, truncate = false)
        string = input_to_s(input)
        @model = Wapiti::Model.new(:pattern => options[:pattern]) if truncate
        @model.train(prepare(string, true))
        @model.compact
        @model.path = Parser.models[options[:model]]
        @model
      end

      def test(input)
        string = input_to_s(input)
        model.options.check!
        model.label(prepare(string, true))
      end

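      # Applies the Normalizer to every label in the hash, then assigns a
      # BibTeX entry type via #classify.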
      def normalize(hash)
        hash.keys.each do |label|
          normalizer.send("normalize_#{label}", hash)
        end
        classify hash
      end

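      # Heuristically assigns a BibTeX entry type (:article, :book, etc.)
      # based on which fields are present; an existing :type is kept.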
      def classify(hash)
        return hash if hash.has_key?(:type)

        keys = hash.keys
        text = hash.values.flatten.join

        case
        when keys.include?(:journal)
          hash[:type] = :article
        when text =~ /proceedings/i
          hash[:type] = :inproceedings
        when keys.include?(:booktitle), keys.include?(:container)
          hash[:type] = :incollection
        when keys.include?(:publisher)
          hash[:type] = :book
        when keys.include?(:institution)
          hash[:type] = :techreport
        when keys.include?(:school)
          hash[:type] = :mastersthesis
        when text =~ /unpublished/i
          hash[:type] = :unpublished
        else
          hash[:type] = :misc
        end

        hash
      end

      private

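      # Converts the input to a string: short strings naming an existing
      # file are read from disk as UTF-8, arrays are joined by newlines,
      # and other strings pass through unchanged.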
      def input_to_s(input)
        case input
        when String
          if input.length < 128 && File.exists?(input)
            f = File.open(input, 'r:UTF-8')
            f.read
          else
            input
          end
        when Array
          input.join("\n")
        else
          raise ArgumentError, "invalid input: #{input.class}"
        end
      ensure
        f.close if f
      end

      def features_for(*arguments)
        Parser.features.map { |f| f.match(*arguments) }
      end

      def strip(token)
        token.gsub(options[:strip], '')
      end

      def format_bibtex(labels)
        b = BibTeX::Bibliography.new
        format_hash(labels).each do |hash|
          b << BibTeX::Entry.new(hash)
        end
        b
      end

      def format_hash(labels)
        labels.map do |line|
          hash = line.inject({}) do |h, (label, token)|
            if h.has_key?(label)
              h[label] = [h[label]].flatten << token
            else
              h[label] = token
            end
            h
          end
          normalize hash
        end
      end

      def format_citeproc(labels)
        format_bibtex(labels).to_citeproc
      end

    end

  end
end
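
A minimal usage sketch for the parser (assuming the gem and its wapiti and bibtex-ruby dependencies are installed, that `require 'anystyle/parser'` is the entry point, and with an invented reference string):

    require 'anystyle/parser'

    parser = Anystyle::Parser::Parser.instance
    ref = 'Derrida, J. (1967). De la grammatologie. Paris: Minuit.'

    parser.parse(ref)           # => array with one normalized hash (default :hash format)
    parser.parse(ref, :bibtex)  # => a BibTeX::Bibliography with one entry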