excite 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ log
2
+ .DS_Store
3
+ tmp/
4
+ *swp
5
+ lib/excite/resources/trainingdata/training_data.txt
6
+ model/test/testing_data*txt
7
+ model/test/testing_data*txt
8
+ model/test/training_refs*txt
9
+ model/test/training_data*txt
10
+ model/test/model
11
+ *~
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --require spec_helper --color
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # encoding: UTF-8
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in the gemspec
6
+ gemspec
7
+
8
+ gem 'engtagger', github: 'academia-edu/engtagger'
data/Gemfile.lock ADDED
@@ -0,0 +1,69 @@
1
+ GIT
2
+ remote: git://github.com/academia-edu/engtagger.git
3
+ revision: 89af3ad6d8299f80e25dad64a4fd49e6a991abaf
4
+ specs:
5
+ engtagger (0.1.2)
6
+
7
+ PATH
8
+ remote: .
9
+ specs:
10
+ excite (2.1.0)
11
+ activesupport
12
+ crfpp
13
+ nokogiri
14
+ openurl
15
+
16
+ GEM
17
+ remote: https://rubygems.org/
18
+ specs:
19
+ activesupport (3.2.12)
20
+ i18n (~> 0.6)
21
+ multi_json (~> 1.0)
22
+ coderay (1.0.8)
23
+ columnize (0.3.6)
24
+ crfpp (0.0.4)
25
+ debugger (1.2.3)
26
+ columnize (>= 0.3.1)
27
+ debugger-linecache (~> 1.1.1)
28
+ debugger-ruby_core_source (~> 1.1.5)
29
+ debugger-linecache (1.1.2)
30
+ debugger-ruby_core_source (>= 1.1.1)
31
+ debugger-ruby_core_source (1.1.6)
32
+ diff-lcs (1.1.3)
33
+ ensure_valid_encoding (0.5.3)
34
+ i18n (0.6.4)
35
+ marc (0.5.0)
36
+ method_source (0.8.1)
37
+ multi_json (1.6.1)
38
+ nokogiri (1.5.6)
39
+ openurl (0.4.2)
40
+ ensure_valid_encoding
41
+ marc
42
+ pry (0.9.10)
43
+ coderay (~> 1.0.5)
44
+ method_source (~> 0.8)
45
+ slop (~> 3.3.1)
46
+ pry-debugger (0.2.1)
47
+ debugger (~> 1.2.0)
48
+ pry (~> 0.9.10)
49
+ rake (10.0.3)
50
+ rspec (2.12.0)
51
+ rspec-core (~> 2.12.0)
52
+ rspec-expectations (~> 2.12.0)
53
+ rspec-mocks (~> 2.12.0)
54
+ rspec-core (2.12.2)
55
+ rspec-expectations (2.12.1)
56
+ diff-lcs (~> 1.1.3)
57
+ rspec-mocks (2.12.1)
58
+ slop (3.3.3)
59
+
60
+ PLATFORMS
61
+ ruby
62
+
63
+ DEPENDENCIES
64
+ engtagger!
65
+ excite!
66
+ pry
67
+ pry-debugger
68
+ rake
69
+ rspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2008 Public Display Inc
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # Excite
2
+
3
+ Provides a simple Ruby API for parsing citations from plain text strings or HTML.
4
+
5
+ ## Usage
6
+
7
+ ```ruby
8
+ require 'excite'
9
+
10
+ Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
11
+
12
+ Excite.parse_html("<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>")
13
+ ```
14
+
15
+ ## History and Credits
16
+
17
+ Derived from [FreeCite](http://freecite.library.brown.edu/), minus Rails and all UI elements. The most up-to-date fork of FreeCite of which I am aware is [rsinger's](https://github.com/rsinger/free_cite). FreeCite in turn is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/).
18
+
19
+ The main changes are:
20
+ * No UI, just a gem;
21
+ * New model for parsing HTML;
22
+ * Tokenization and part-of-speech features from [EngTagger](https://github.com/yohasebe/engtagger).
23
+
24
+ Credit is due to the authors of all the linked projects, as well as Laura Durkay, who marked up the HTML training data.
25
+
26
+ ## Install required packages
27
+
28
+ ### From source
29
+
30
+ wget http://crfpp.googlecode.com/files/CRF%2B%2B-0.57.tar.gz
31
+ tar xvzf CRF++-0.57.tar.gz
32
+ cd CRF++-0.57
33
+ ./configure
34
+ make
35
+ sudo make install
36
+
37
+ ### On Ubuntu
38
+
39
+ sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
40
+ sudo apt-get update
41
+ sudo apt-get install libcrf++
42
+
43
+ ### On OS X with Homebrew
44
+
45
+ brew install crf++
46
+
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'rake'
4
+ require 'bundler/gem_tasks'
5
+ require 'rspec/core/rake_task'
6
+ require 'excite'
7
+
8
+ RSpec::Core::RakeTask.new :spec
9
+
10
+ namespace :crfparser do
11
+ desc 'train a CRF model for the citation parser'
12
+ task :train_model, :type do |_, args|
13
+ mode = args[:type] ? args[:type].to_sym : :string
14
+ Excite::CRFParser.new(mode).train
15
+ end
16
+
17
+ desc 'test a CRF model with the current training set & features'
18
+ task :test_model, :type do |_, args|
19
+ require "#{File.dirname(__FILE__)}/model/test/model_test"
20
+ mode = args[:type] ? args[:type].to_sym : :string
21
+ Excite::ModelTest.new(mode).run_test
22
+ end
23
+ end
24
+
@@ -0,0 +1,68 @@
1
+ # This file has two parts: `rules`, a hash of named regular-expression rules, and `order`, an array declaring the order in which rules are run.
2
+ # The keys of each rules hash are:
3
+ # regex (required): a string representation of the regular expression rule
4
+ # ignore_case (optional): a boolean to define whether or not the regex is case insensitive (the "i" flag)
5
+ # replacement_str (optional): the replacement string. Absence of this attribute assumes "" (empty string)
6
+
7
+
8
+ order:
9
+ - sub_quotes_0
10
+ - sub_quotes_1
11
+ - strip_quotes_0
12
+
13
+ # TODO Figure out which rules might be helpful, and which are dumber versions of model features
14
+ rules:
15
+ vol_0:
16
+ # Description: ensure whitespace exists between the string 'vol.' and the actual volume number
17
+ regex: \svol\.([A-z0-9]*)
18
+ ignore_case: true
19
+ replacement_str: ' vol. \1'
20
+ vol_1:
21
+ # Description: replace 'Vol' with 'vol.'
22
+ regex: \sVol\W
23
+ replacement_str: ' vol. '
24
+ vol_2:
25
+ # Description: replace 'Volume' with 'vol.' (lowercased 'volume' seems fine)
26
+ regex: \sVolume\W
27
+ replacement_str: ' vol. '
28
+ year_0:
29
+ # Description: Years generally appear in the form (YYYY); rewrite variants such as (YYYY/YY) to (YYYY)
30
+ regex: \s\(([0-9]{4})\/?[0-9]*\)\.?\s
31
+ replacement_str: ' (\1) '
32
+ number_range_0:
33
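+ # Description: Ranges (generally pages) can't have spaces around the hyphen. NOTE(review): the regex below is identical to year_0's and does not match number ranges -- likely a copy-paste error; confirm the intended pattern.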
34
+ regex: \s\(([0-9]{4})\/?[0-9]*\)\.?\s
35
+ replacement_str: ' (\1) '
36
+ vol_iss_0:
37
+ # Description: Volume and issue sometimes appear as '123, (4)' or '123 (4)': rewrite to '123:4'
38
+ regex: \s([0-9]{1,3}),?\s\(([0-9]+)\)
39
+ replacement_str: ' \1:\2'
40
+ iss_0:
41
+ # Description: 'Issue' needs to be lowercased
42
+ regex: \sIssue\b
43
+ replacement_str: ' issue'
44
+ eresource_0:
45
+ # Description: remove the junk about it being an electronic copy
46
+ regex: \s?\[electronic (journal|resource|version)\]
47
+ ignore_case: true
48
+ eresource_1:
49
+ # Description: variant cruft about it being an electronic copy
50
+ regex: \s?\[online\]
51
+ ignore_case: true
52
+ leading_source_0:
53
+ # Description: Many citations lead with the database they were found in: 'JSTOR', 'EBSCO', etc. These need to be removed
54
+ regex: ^(JSTOR|Blackwell Synergy)\W*
55
+ ignore_case: true
56
+ sub_quotes_0:
57
+ # Replace non-ASCII double quotation marks
58
+ regex: "[“”‟„"〝〞〟]"
59
+ replacement_str: '"'
60
+ sub_quotes_1:
61
+ # Replace non-ASCII single quotation marks
62
+ regex: "[‘’‛‚'`]"
63
+ replacement_str: "'"
64
+ strip_quotes_0:
65
+ # Remove leading and trailing quotes from citation string
66
+ regex: ^\s*\"(.*)\"\s*$
67
+ replacement_str: \1
68
+
@@ -0,0 +1,55 @@
1
+ # Specify feature print-order here, and refer to the associated numbers
2
+ # when writing the template file
3
+ string:
4
+ feature_order:
5
+ - 'last_char' # 1
6
+ - 'first_1_char' # 2
7
+ - 'first_2_chars' # 3
8
+ - 'first_3_chars' # 4
9
+ - 'first_4_chars' # 5
10
+ - 'last_1_char' # 6
11
+ - 'last_2_chars' # 7
12
+ - 'last_3_chars' # 8
13
+ - 'last_4_chars' # 9
14
+ - 'toklcnp' # 10
15
+ - 'capitalization' # 11
16
+ - 'numbers' # 12
17
+ - 'a_is_in_dict' # 13
18
+ - 'firstName' # 14
19
+ - 'lastName' # 15
20
+ - 'monthName' # 16
21
+ - 'placeName' # 17
22
+ - 'publisherName' # 18
23
+ - 'possible_editor' # 19
24
+ - 'location' # 20
25
+ - 'punct' # 21
26
+ - 'is_in' # 22
27
+ - 'part_of_speech' # 23
28
+
29
+ html:
30
+ feature_order:
31
+ - 'last_char' # 1
32
+ - 'first_1_char' # 2
33
+ - 'first_2_chars' # 3
34
+ - 'first_3_chars' # 4
35
+ - 'first_4_chars' # 5
36
+ - 'last_1_char' # 6
37
+ - 'last_2_chars' # 7
38
+ - 'last_3_chars' # 8
39
+ - 'last_4_chars' # 9
40
+ - 'toklcnp' # 10
41
+ - 'capitalization' # 11
42
+ - 'numbers' # 12
43
+ - 'a_is_in_dict' # 13
44
+ - 'firstName' # 14
45
+ - 'lastName' # 15
46
+ - 'monthName' # 16
47
+ - 'placeName' # 17
48
+ - 'publisherName' # 18
49
+ - 'possible_editor' # 19
50
+ - 'location' # 20
51
+ - 'punct' # 21
52
+ - 'is_in' # 22
53
+ - 'tag_name' # 23
54
+ - 'location_in_node' # 24
55
+ - 'part_of_speech' # 25
data/excite.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ lib = File.expand_path('../lib', __FILE__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+
6
+ require 'excite/version'
7
+
8
+ Gem::Specification.new do |gem|
9
+ gem.name = "excite"
10
+ gem.version = Excite::VERSION
11
+ gem.authors = ["David Judd"]
12
+ gem.email = ["david@academia.edu"]
13
+ gem.summary = %q{Parse citations}
14
+ gem.homepage = "http://github.com/academia-edu/free_cite"
15
+
16
+ gem.files = `git ls-files`.split($\)
17
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
18
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
19
+ gem.require_paths = ["lib"]
20
+
21
+ gem.add_dependency 'openurl'
22
+ gem.add_dependency 'activesupport'
23
+ gem.add_dependency 'crfpp'
24
+ gem.add_dependency 'nokogiri'
25
+
26
+ gem.add_development_dependency 'rake'
27
+ gem.add_development_dependency 'rspec'
28
+ gem.add_development_dependency 'pry'
29
+ gem.add_development_dependency 'pry-debugger'
30
+ end
@@ -0,0 +1,27 @@
1
+ # encoding: UTF-8
2
+
3
+ class Array
4
+ def mean
5
+ (size > 0) ? sum.to_f / size : 0
6
+ end
7
+
8
+ def stddev
9
+ m = mean
10
+ devsum = inject( 0 ) { |ds,x| ds += (x - m)**2 }
11
+ (size > 0) ? (devsum.to_f / size) ** 0.5 : 0
12
+ end
13
+
14
+ def cov(other)
15
+ zip(other).map {|a,b| a*b }.mean - (mean * other.mean)
16
+ end
17
+
18
+ def pearson_r(other)
19
+ unless size == other.size
20
+ raise "Vectors must be of same length to calculate pearson_r"
21
+ end
22
+ devp = stddev * other.stddev
23
+ (devp > 0) ? cov(other) / devp : 0.0
24
+ end
25
+
26
+ end
27
+
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'active_support/core_ext/object'
4
+
5
+ module Excite
6
+
7
+ # parse a string into a citation
8
+ # optionally pass the presumed author
9
+ def self.parse_string(str, author=nil)
10
+ if str.present?
11
+ Citation.new(str, string_parser, author)
12
+ end
13
+ end
14
+
15
+ class << self
16
+ alias_method :parse, :parse_string # for backwards compatibility
17
+ end
18
+
19
+ def self.parse_html(html, author=nil)
20
+ if html.present?
21
+ Citation.new(html, html_parser, author)
22
+ end
23
+ end
24
+
25
+ private
26
+
27
+ def self.string_parser
28
+ Thread.current[:string_crf_parser] ||= CRFParser.new(:string)
29
+ end
30
+
31
+ def self.html_parser
32
+ Thread.current[:html_crf_parser] ||= CRFParser.new(:html)
33
+ end
34
+
35
+ class Citation < Hash
36
+
37
+ attr_accessor :probabilities, :overall_probability
38
+
39
+ def initialize(str, parser, author=nil)
40
+ raw_hash, overall_prob, tag_probs = parser.parse(str, author)
41
+ self.replace(raw_hash.symbolize_keys)
42
+ @probabilities = tag_probs.symbolize_keys
43
+ @overall_probability = overall_prob
44
+ end
45
+
46
+ end
47
+
48
+ end