RubyGems - excite - Versions diffs - 2.1.1 - Mend

excite 2.1.1

Files changed (44) hide show

data/.gitignore +11 -0
data/.rspec +1 -0
data/Gemfile +8 -0
data/Gemfile.lock +69 -0
data/LICENSE +22 -0
data/README.md +46 -0
data/Rakefile +24 -0
data/config/citation_cleanup_rules.yml +68 -0
data/config/parscit_features.yml +55 -0
data/excite.gemspec +30 -0
data/lib/excite/array_helpers.rb +27 -0
data/lib/excite/citation.rb +48 -0
data/lib/excite/crfparser.rb +322 -0
data/lib/excite/postprocessor.rb +252 -0
data/lib/excite/preprocessor.rb +107 -0
data/lib/excite/resources/dicts/female-names +4954 -0
data/lib/excite/resources/dicts/first-names +27926 -0
data/lib/excite/resources/dicts/male-names +3901 -0
data/lib/excite/resources/dicts/months +24 -0
data/lib/excite/resources/dicts/places +43109 -0
data/lib/excite/resources/dicts/publishers +654 -0
data/lib/excite/resources/dicts/surnames +146259 -0
data/lib/excite/resources/html.template +84 -0
data/lib/excite/resources/html_model +0 -0
data/lib/excite/resources/model +0 -0
data/lib/excite/resources/parsCit.template +76 -0
data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
data/lib/excite/resources/trainingdata/verify.rb +97 -0
data/lib/excite/token_features.rb +313 -0
data/lib/excite/version.rb +7 -0
data/lib/excite.rb +13 -0
data/model/test/analysis.csv +54 -0
data/model/test/array_helpers.rb +30 -0
data/model/test/html-analysis.csv +60 -0
data/model/test/html-output.txt +19893 -0
data/model/test/model_test.rb +306 -0
data/model/test/output.txt +16742 -0
data/spec/excite/citation_spec.rb +128 -0
data/spec/excite/crfparser_spec.rb +118 -0
data/spec/excite/postprocessor_spec.rb +68 -0
data/spec/excite/token_features_spec.rb +641 -0
data/spec/spec_helper.rb +4 -0
metadata +222 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,11 @@
+log
+.DS_Store
+tmp/
+*swp
+lib/excite/resources/trainingdata/training_data.txt
+model/test/testing_data*txt
+model/test/testing_data*txt
+model/test/training_refs*txt
+model/test/training_data*txt
+model/test/model
+*~

data/.rspec ADDED Viewed

@@ -0,0 +1 @@
+--require spec_helper --color

data/Gemfile ADDED Viewed

@@ -0,0 +1,8 @@
+# encoding: UTF-8
+source 'https://rubygems.org'
+# Specify your gem's dependencies in the gemspec
+gemspec
+gem 'engtagger', github: 'academia-edu/engtagger'

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,69 @@
+GIT
+  remote: git://github.com/academia-edu/engtagger.git
+  revision: 89af3ad6d8299f80e25dad64a4fd49e6a991abaf
+  specs:
+    engtagger (0.1.2)
+PATH
+  remote: .
+  specs:
+    excite (2.1.0)
+      activesupport
+      crfpp
+      nokogiri
+      openurl
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activesupport (3.2.12)
+      i18n (~> 0.6)
+      multi_json (~> 1.0)
+    coderay (1.0.8)
+    columnize (0.3.6)
+    crfpp (0.0.4)
+    debugger (1.2.3)
+      columnize (>= 0.3.1)
+      debugger-linecache (~> 1.1.1)
+      debugger-ruby_core_source (~> 1.1.5)
+    debugger-linecache (1.1.2)
+      debugger-ruby_core_source (>= 1.1.1)
+    debugger-ruby_core_source (1.1.6)
+    diff-lcs (1.1.3)
+    ensure_valid_encoding (0.5.3)
+    i18n (0.6.4)
+    marc (0.5.0)
+    method_source (0.8.1)
+    multi_json (1.6.1)
+    nokogiri (1.5.6)
+    openurl (0.4.2)
+      ensure_valid_encoding
+      marc
+    pry (0.9.10)
+      coderay (~> 1.0.5)
+      method_source (~> 0.8)
+      slop (~> 3.3.1)
+    pry-debugger (0.2.1)
+      debugger (~> 1.2.0)
+      pry (~> 0.9.10)
+    rake (10.0.3)
+    rspec (2.12.0)
+      rspec-core (~> 2.12.0)
+      rspec-expectations (~> 2.12.0)
+      rspec-mocks (~> 2.12.0)
+    rspec-core (2.12.2)
+    rspec-expectations (2.12.1)
+      diff-lcs (~> 1.1.3)
+    rspec-mocks (2.12.1)
+    slop (3.3.3)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  engtagger!
+  excite!
+  pry
+  pry-debugger
+  rake
+  rspec

data/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+The MIT License
+Copyright (c) 2008 Public Display Inc
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,46 @@
+# Excite
+Provides a simple Ruby API for parsing citations from plain text strings or HTML.
+## Usage
+```ruby
+  require 'excite'
+  Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
+  Excite.parse_html("<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>")
+```
+## History and Credits
+Derived from [FreeCite](http://freecite.library.brown.edu/), minus Rails and all UI elements. The most up-to-date fork of FreeCite of which I am aware is [rsinger's](https://github.com/rsinger/free_cite). FreeCite in turn is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/).
+The main changes are:
+* No UI, just a gem;
+* New model for parsing HTML;
+* Tokenization and part-of-speech features from [EngTagger](https://github.com/yohasebe/engtagger).
+Credit is due to the authors of all the linked projects, as well as Laura Durkay who marked up the HTML training data.
+## Install required packages
+### From source
+    wget http://crfpp.googlecode.com/files/CRF%2B%2B-0.57.tar.gz
+    tar xvzf CRF++-0.57.tar.gz
+    cd CRF++-0.57
+    ./configure
+    make
+    sudo make install
+### On Ubuntu
+    sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
+    sudo apt-get update
+    sudo apt-get install libcrf++
+### On OS X with Homebrew
+    brew install crf++

data/Rakefile ADDED Viewed

@@ -0,0 +1,24 @@
+# encoding: UTF-8
+require 'rake'
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+require 'excite'
+RSpec::Core::RakeTask.new :spec
+namespace :crfparser do
+  desc 'train a CRF model for the citation parser'
+  task :train_model, :type do |_, args|
+    mode = args[:type] ? args[:type].to_sym : :string
+    Excite::CRFParser.new(mode).train
+  end
+  desc 'test a CRF model with the current training set & features'
+  task :test_model, :type do |_, args|
+    require "#{File.dirname(__FILE__)}/model/test/model_test"
+    mode = args[:type] ? args[:type].to_sym : :string
+    Excite::ModelTest.new(mode).run_test
+  end
+end

data/config/citation_cleanup_rules.yml ADDED Viewed

@@ -0,0 +1,68 @@
+# This file has two parts: `rules`, a hash of named regular-expression rules, and `order`, an array declaring the order in which the rules should be run.
+# The keys of each rules hash are:
+#    regex (required): a string representation of the regular expression rule
+#    ignore_case (optional): a boolean to define whether or not the regex is case insensitive (the "i" flag)
+#    replacement_str (optional): the replacement string.  Absence of this attribute assumes "" (empty string)
+order:
+    - sub_quotes_0
+    - sub_quotes_1
+    - strip_quotes_0
+# TODO Figure out which rules might be helpful, and which are dumber versions of model features
+rules:
+    vol_0:
+        # Description: be sure a whitespace exists between string 'vol.' and the actual volume number
+        # Note: the class is spelled A-Za-z0-9 because the range A-z would also
+        # cover the punctuation characters that sit between 'Z' and 'a' in ASCII.
+        regex: \svol\.([A-Za-z0-9]*)
+        ignore_case: true
+        replacement_str: ' vol. \1'
+    vol_1:
+        # Description: replace 'Vol' with 'vol.'
+        regex: \sVol\W
+        replacement_str: ' vol. '
+    vol_2:
+        # Description: replace 'Volume' with 'vol.' (lowercased 'volume' seems fine)
+        regex: \sVolume\W
+        replacement_str: ' vol. '
+    year_0:
+        # Description: Years generally appear in the form (YYYY), remove any variants ((YYYY/YY), etc.)
+        regex: \s\(([0-9]{4})\/?[0-9]*\)\.?\s
+        replacement_str: ' (\1) '
+    number_range_0:
+        # Description: Ranges (generally pages) can't have spaces around the hyphen: rewrite '53 - 65' to '53-65'
+        regex: ([0-9]+)\s*-\s*([0-9]+)
+        replacement_str: '\1-\2'
+    vol_iss_0:
+        # Description: Volume and issue sometimes appear as '123, (4)' or '123 (4)': rewrite to '123:4'
+        regex: \s([0-9]{1,3}),?\s\(([0-9]+)\)
+        replacement_str: ' \1:\2'
+    iss_0:
+        # Description: 'Issue' needs to be lowercased
+        regex: \sIssue\b
+        replacement_str: ' issue'
+    eresource_0:
+        # Description: remove the junk about it being an electronic copy
+        regex: \s?\[electronic (journal|resource|version)\]
+        ignore_case: true
+    eresource_1:
+        # Description: variant cruft about it being an electronic copy
+        regex: \s?\[online\]
+        ignore_case: true
+    leading_source_0:
+        # Description: Many citations lead with the database they were found in: 'JSTOR', 'EBSCO', etc.  These need to be removed
+        regex: ^(JSTOR|Blackwell Synergy)\W*
+        ignore_case: true
+    sub_quotes_0:
+        # Replace non-ASCII double quotation marks
+        regex: "[“”‟„＂〝〞〟]"
+        replacement_str: '"'
+    sub_quotes_1:
+        # Replace non-ASCII single quotation marks
+        regex: "[‘’‛‚＇`]"
+        replacement_str: "'"
+    strip_quotes_0:
+        # Remove leading and trailing quotes from citation string
+        regex: ^\s*\"(.*)\"\s*$
+        replacement_str: \1

data/config/parscit_features.yml ADDED Viewed

@@ -0,0 +1,55 @@
+# Specify feature print-order here, and refer to the associated numbers
+# when writing the template file
+string:
+  feature_order:
+    - 'last_char'        # 1
+    - 'first_1_char'     # 2
+    - 'first_2_chars'    # 3
+    - 'first_3_chars'    # 4
+    - 'first_4_chars'    # 5
+    - 'last_1_char'      # 6
+    - 'last_2_chars'     # 7
+    - 'last_3_chars'     # 8
+    - 'last_4_chars'     # 9
+    - 'toklcnp'          # 10
+    - 'capitalization'   # 11
+    - 'numbers'          # 12
+    - 'a_is_in_dict'     # 13
+    - 'firstName'        # 14
+    - 'lastName'         # 15
+    - 'monthName'        # 16
+    - 'placeName'        # 17
+    - 'publisherName'    # 18
+    - 'possible_editor'  # 19
+    - 'location'         # 20
+    - 'punct'            # 21
+    - 'is_in'            # 22
+    - 'part_of_speech'   # 23
+html:
+  feature_order:
+    - 'last_char'        # 1
+    - 'first_1_char'     # 2
+    - 'first_2_chars'    # 3
+    - 'first_3_chars'    # 4
+    - 'first_4_chars'    # 5
+    - 'last_1_char'      # 6
+    - 'last_2_chars'     # 7
+    - 'last_3_chars'     # 8
+    - 'last_4_chars'     # 9
+    - 'toklcnp'          # 10
+    - 'capitalization'   # 11
+    - 'numbers'          # 12
+    - 'a_is_in_dict'     # 13
+    - 'firstName'        # 14
+    - 'lastName'         # 15
+    - 'monthName'        # 16
+    - 'placeName'        # 17
+    - 'publisherName'    # 18
+    - 'possible_editor'  # 19
+    - 'location'         # 20
+    - 'punct'            # 21
+    - 'is_in'            # 22
+    - 'tag_name'         # 23
+    - 'location_in_node' # 24
+    - 'part_of_speech'   # 25

data/excite.gemspec ADDED Viewed

@@ -0,0 +1,30 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'excite/version'
+Gem::Specification.new do |gem|
+  gem.name          = "excite"
+  gem.version       = Excite::VERSION
+  gem.authors       = ["David Judd"]
+  gem.email         = ["david@academia.edu"]
+  gem.summary       = %q{Parse citations}
+  gem.homepage      = "http://github.com/academia-edu/free_cite"
+  gem.files         = `git ls-files`.split($\)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+  gem.add_dependency 'openurl'
+  gem.add_dependency 'activesupport'
+  gem.add_dependency 'crfpp'
+  gem.add_dependency 'nokogiri'
+  gem.add_development_dependency 'rake'
+  gem.add_development_dependency 'rspec'
+  gem.add_development_dependency 'pry'
+  gem.add_development_dependency 'pry-debugger'
+end

data/lib/excite/array_helpers.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# encoding: UTF-8
+class Array
+  def mean
+    (size > 0) ? sum.to_f / size : 0
+  end
+  def stddev
+    m = mean
+    devsum = inject( 0 ) { |ds,x| ds += (x - m)**2 }
+    (size > 0) ? (devsum.to_f / size) ** 0.5 : 0
+  end
+  def cov(other)
+    zip(other).map {|a,b| a*b }.mean - (mean * other.mean)
+  end
+  def pearson_r(other)
+    unless size == other.size
+      raise "Vectors must be of same length to calculate pearson_r"
+    end
+    devp = stddev * other.stddev
+    (devp > 0) ? cov(other) / devp : 0.0
+  end
+end

data/lib/excite/citation.rb ADDED Viewed

@@ -0,0 +1,48 @@
+# encoding: UTF-8
+require 'active_support/core_ext/object'
+module Excite
+  # parse a string into a citation
+  # optionally pass the presumed author
+  def self.parse_string(str, author=nil)
+    if str.present?
+      Citation.new(str, string_parser, author)
+    end
+  end
+  class << self
+    alias_method :parse, :parse_string # for backwards compatibility
+  end
+  def self.parse_html(html, author=nil)
+    if html.present?
+      Citation.new(html, html_parser, author)
+    end
+  end
+private
+  def self.string_parser
+    Thread.current[:string_crf_parser] ||= CRFParser.new(:string)
+  end
+  def self.html_parser
+    Thread.current[:html_crf_parser] ||= CRFParser.new(:html)
+  end
+  class Citation < Hash
+    attr_accessor :probabilities, :overall_probability
+    def initialize(str, parser, author=nil)
+      raw_hash, overall_prob, tag_probs = parser.parse(str, author)
+      self.replace(raw_hash.symbolize_keys)
+      @probabilities = tag_probs.symbolize_keys
+      @overall_probability = overall_prob
+    end
+  end
+end