excite 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ log
2
+ .DS_Store
3
+ tmp/
4
+ *swp
5
+ lib/excite/resources/trainingdata/training_data.txt
6
+ model/test/testing_data*txt
7
+ model/test/testing_data*txt
8
+ model/test/training_refs*txt
9
+ model/test/training_data*txt
10
+ model/test/model
11
+ *~
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --require spec_helper --color
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # encoding: UTF-8
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in the gemspec
6
+ gemspec
7
+
8
+ gem 'engtagger', github: 'academia-edu/engtagger'
data/Gemfile.lock ADDED
@@ -0,0 +1,69 @@
1
+ GIT
2
+ remote: git://github.com/academia-edu/engtagger.git
3
+ revision: 89af3ad6d8299f80e25dad64a4fd49e6a991abaf
4
+ specs:
5
+ engtagger (0.1.2)
6
+
7
+ PATH
8
+ remote: .
9
+ specs:
10
+ excite (2.1.0)
11
+ activesupport
12
+ crfpp
13
+ nokogiri
14
+ openurl
15
+
16
+ GEM
17
+ remote: https://rubygems.org/
18
+ specs:
19
+ activesupport (3.2.12)
20
+ i18n (~> 0.6)
21
+ multi_json (~> 1.0)
22
+ coderay (1.0.8)
23
+ columnize (0.3.6)
24
+ crfpp (0.0.4)
25
+ debugger (1.2.3)
26
+ columnize (>= 0.3.1)
27
+ debugger-linecache (~> 1.1.1)
28
+ debugger-ruby_core_source (~> 1.1.5)
29
+ debugger-linecache (1.1.2)
30
+ debugger-ruby_core_source (>= 1.1.1)
31
+ debugger-ruby_core_source (1.1.6)
32
+ diff-lcs (1.1.3)
33
+ ensure_valid_encoding (0.5.3)
34
+ i18n (0.6.4)
35
+ marc (0.5.0)
36
+ method_source (0.8.1)
37
+ multi_json (1.6.1)
38
+ nokogiri (1.5.6)
39
+ openurl (0.4.2)
40
+ ensure_valid_encoding
41
+ marc
42
+ pry (0.9.10)
43
+ coderay (~> 1.0.5)
44
+ method_source (~> 0.8)
45
+ slop (~> 3.3.1)
46
+ pry-debugger (0.2.1)
47
+ debugger (~> 1.2.0)
48
+ pry (~> 0.9.10)
49
+ rake (10.0.3)
50
+ rspec (2.12.0)
51
+ rspec-core (~> 2.12.0)
52
+ rspec-expectations (~> 2.12.0)
53
+ rspec-mocks (~> 2.12.0)
54
+ rspec-core (2.12.2)
55
+ rspec-expectations (2.12.1)
56
+ diff-lcs (~> 1.1.3)
57
+ rspec-mocks (2.12.1)
58
+ slop (3.3.3)
59
+
60
+ PLATFORMS
61
+ ruby
62
+
63
+ DEPENDENCIES
64
+ engtagger!
65
+ excite!
66
+ pry
67
+ pry-debugger
68
+ rake
69
+ rspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2008 Public Display Inc
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # Excite
2
+
3
+ Provides a simple Ruby API for parsing citations from plain text strings or HTML.
4
+
5
+ ## Usage
6
+
7
+ ```ruby
8
+ require 'excite'
9
+
10
+ Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
11
+
12
+ Excite.parse_html("<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>")
13
+ ```
14
+
15
+ ## History and Credits
16
+
17
+ Derived from [FreeCite](http://freecite.library.brown.edu/), minus Rails and all UI elements. The most up-to-date fork of FreeCite of which I am aware is [rsinger's](https://github.com/rsinger/free_cite). FreeCite in turn is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/).
18
+
19
+ The main changes are:
20
+ * No UI, just a gem;
21
+ * New model for parsing HTML;
22
+ * Tokenization and part-of-speech features from [EngTagger](https://github.com/yohasebe/engtagger).
23
+
24
+ Credit is due to the authors of all the linked projects, as well as Laura Durkay who marked up the HTML training data.
25
+
26
+ ## Install required packages
27
+
28
+ ### From source
29
+
30
+ wget http://crfpp.googlecode.com/files/CRF%2B%2B-0.57.tar.gz
31
+ tar xvzf CRF++-0.57.tar.gz
32
+ cd CRF++-0.57
33
+ ./configure
34
+ make
35
+ sudo make install
36
+
37
+ ### On Ubuntu
38
+
39
+ sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
40
+ sudo apt-get update
41
+ sudo apt-get install libcrf++
42
+
43
+ ### On OS X with Homebrew
44
+
45
+ brew install crf++
46
+
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'rake'
4
+ require 'bundler/gem_tasks'
5
+ require 'rspec/core/rake_task'
6
+ require 'excite'
7
+
8
+ RSpec::Core::RakeTask.new :spec
9
+
10
+ namespace :crfparser do
11
+ desc 'train a CRF model for the citation parser'
12
+ task :train_model, :type do |_, args|
13
+ mode = args[:type] ? args[:type].to_sym : :string
14
+ Excite::CRFParser.new(mode).train
15
+ end
16
+
17
+ desc 'test a CRF model with the current training set & features'
18
+ task :test_model, :type do |_, args|
19
+ require "#{File.dirname(__FILE__)}/model/test/model_test"
20
+ mode = args[:type] ? args[:type].to_sym : :string
21
+ Excite::ModelTest.new(mode).run_test
22
+ end
23
+ end
24
+
@@ -0,0 +1,68 @@
1
+ # This file has two parts: `rules`, a hash of named regular-expression rules, and `order`, an array declaring the order in which the rules are run.
2
+ # The keys of each rule hash are:
3
+ # regex (required): a string representation of the regular expression rule
4
+ # ignore_case (optional): a boolean to define whether or not the regex is case insensitive (the "i" flag)
5
+ # replacement_str (optional): the replacement string. Absence of this attribute assumes "" (empty string)
6
+
7
+
8
+ order:
9
+ - sub_quotes_0
10
+ - sub_quotes_1
11
+ - strip_quotes_0
12
+
13
+ # TODO Figure out which rules might be helpful, and which are dumber versions of model features
14
+ rules:
15
+ vol_0:
16
+ # Description: ensure whitespace exists between the string 'vol.' and the actual volume number
17
+ regex: \svol\.([A-z0-9]*)
18
+ ignore_case: true
19
+ replacement_str: ' vol. \1'
20
+ vol_1:
21
+ # Description: replace 'Vol' with 'vol.'
22
+ regex: \sVol\W
23
+ replacement_str: ' vol. '
24
+ vol_2:
25
+ # Description: replace 'Volume' with 'vol.' (lowercased 'volume' seems fine)
26
+ regex: \sVolume\W
27
+ replacement_str: ' vol. '
28
+ year_0:
29
+ # Description: Years generally appear in the form (YYYY); rewrite any variants ((YYYY/YY), etc.) to that form
30
+ regex: \s\(([0-9]{4})\/?[0-9]*\)\.?\s
31
+ replacement_str: ' (\1) '
32
+ number_range_0:
33
+ # Description: Ranges (generally pages) can't have spaces around the hyphen (NOTE: the regex below is identical to year_0's and does not match ranges)
34
+ regex: \s\(([0-9]{4})\/?[0-9]*\)\.?\s
35
+ replacement_str: ' (\1) '
36
+ vol_iss_0:
37
+ # Description: Volume and issue sometimes appear as '123, (4)' or '123 (4)': rewrite to '123:4'
38
+ regex: \s([0-9]{1,3}),?\s\(([0-9]+)\)
39
+ replacement_str: ' \1:\2'
40
+ iss_0:
41
+ # Description: 'Issue' needs to be lowercased
42
+ regex: \sIssue\b
43
+ replacement_str: ' issue'
44
+ eresource_0:
45
+ # Description: remove the junk about it being an electronic copy
46
+ regex: \s?\[electronic (journal|resource|version)\]
47
+ ignore_case: true
48
+ eresource_1:
49
+ # Description: variant cruft about it being an electronic copy
50
+ regex: \s?\[online\]
51
+ ignore_case: true
52
+ leading_source_0:
53
+ # Description: Many citations lead with the database they were found in: 'JSTOR', 'EBSCO', etc. These need to be removed
54
+ regex: ^(JSTOR|Blackwell Synergy)\W*
55
+ ignore_case: true
56
+ sub_quotes_0:
57
+ # Replace non-ASCII double quotation marks
58
+ regex: "[“”‟„"〝〞〟]"
59
+ replacement_str: '"'
60
+ sub_quotes_1:
61
+ # Replace non-ASCII single quotation marks
62
+ regex: "[‘’‛‚'`]"
63
+ replacement_str: "'"
64
+ strip_quotes_0:
65
+ # Remove leading and trailing quotes from citation string
66
+ regex: ^\s*\"(.*)\"\s*$
67
+ replacement_str: \1
68
+
@@ -0,0 +1,55 @@
1
+ # Specify feature print-order here, and refer to the associated numbers
2
+ # when writing the template file
3
+ string:
4
+ feature_order:
5
+ - 'last_char' # 1
6
+ - 'first_1_char' # 2
7
+ - 'first_2_chars' # 3
8
+ - 'first_3_chars' # 4
9
+ - 'first_4_chars' # 5
10
+ - 'last_1_char' # 6
11
+ - 'last_2_chars' # 7
12
+ - 'last_3_chars' # 8
13
+ - 'last_4_chars' # 9
14
+ - 'toklcnp' # 10
15
+ - 'capitalization' # 11
16
+ - 'numbers' # 12
17
+ - 'a_is_in_dict' # 13
18
+ - 'firstName' # 14
19
+ - 'lastName' # 15
20
+ - 'monthName' # 16
21
+ - 'placeName' # 17
22
+ - 'publisherName' # 18
23
+ - 'possible_editor' # 19
24
+ - 'location' # 20
25
+ - 'punct' # 21
26
+ - 'is_in' # 22
27
+ - 'part_of_speech' # 23
28
+
29
+ html:
30
+ feature_order:
31
+ - 'last_char' # 1
32
+ - 'first_1_char' # 2
33
+ - 'first_2_chars' # 3
34
+ - 'first_3_chars' # 4
35
+ - 'first_4_chars' # 5
36
+ - 'last_1_char' # 6
37
+ - 'last_2_chars' # 7
38
+ - 'last_3_chars' # 8
39
+ - 'last_4_chars' # 9
40
+ - 'toklcnp' # 10
41
+ - 'capitalization' # 11
42
+ - 'numbers' # 12
43
+ - 'a_is_in_dict' # 13
44
+ - 'firstName' # 14
45
+ - 'lastName' # 15
46
+ - 'monthName' # 16
47
+ - 'placeName' # 17
48
+ - 'publisherName' # 18
49
+ - 'possible_editor' # 19
50
+ - 'location' # 20
51
+ - 'punct' # 21
52
+ - 'is_in' # 22
53
+ - 'tag_name' # 23
54
+ - 'location_in_node' # 24
55
+ - 'part_of_speech' # 25
data/excite.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ lib = File.expand_path('../lib', __FILE__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+
6
+ require 'excite/version'
7
+
8
+ Gem::Specification.new do |gem|
9
+ gem.name = "excite"
10
+ gem.version = Excite::VERSION
11
+ gem.authors = ["David Judd"]
12
+ gem.email = ["david@academia.edu"]
13
+ gem.summary = %q{Parse citations}
14
+ gem.homepage = "http://github.com/academia-edu/free_cite"
15
+
16
+ gem.files = `git ls-files`.split($\)
17
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
18
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
19
+ gem.require_paths = ["lib"]
20
+
21
+ gem.add_dependency 'openurl'
22
+ gem.add_dependency 'activesupport'
23
+ gem.add_dependency 'crfpp'
24
+ gem.add_dependency 'nokogiri'
25
+
26
+ gem.add_development_dependency 'rake'
27
+ gem.add_development_dependency 'rspec'
28
+ gem.add_development_dependency 'pry'
29
+ gem.add_development_dependency 'pry-debugger'
30
+ end
@@ -0,0 +1,27 @@
1
+ # encoding: UTF-8
2
+
3
+ class Array
4
+ def mean
5
+ (size > 0) ? sum.to_f / size : 0
6
+ end
7
+
8
+ def stddev
9
+ m = mean
10
+ devsum = inject( 0 ) { |ds,x| ds += (x - m)**2 }
11
+ (size > 0) ? (devsum.to_f / size) ** 0.5 : 0
12
+ end
13
+
14
+ def cov(other)
15
+ zip(other).map {|a,b| a*b }.mean - (mean * other.mean)
16
+ end
17
+
18
+ def pearson_r(other)
19
+ unless size == other.size
20
+ raise "Vectors must be of same length to calculate pearson_r"
21
+ end
22
+ devp = stddev * other.stddev
23
+ (devp > 0) ? cov(other) / devp : 0.0
24
+ end
25
+
26
+ end
27
+
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'active_support/core_ext/object'
4
+
5
+ module Excite
6
+
7
+ # parse a string into a citation
8
+ # optionally pass the presumed author
9
+ def self.parse_string(str, author=nil)
10
+ if str.present?
11
+ Citation.new(str, string_parser, author)
12
+ end
13
+ end
14
+
15
+ class << self
16
+ alias_method :parse, :parse_string # for backwards compatibility
17
+ end
18
+
19
+ def self.parse_html(html, author=nil)
20
+ if html.present?
21
+ Citation.new(html, html_parser, author)
22
+ end
23
+ end
24
+
25
+ private
26
+
27
+ def self.string_parser
28
+ Thread.current[:string_crf_parser] ||= CRFParser.new(:string)
29
+ end
30
+
31
+ def self.html_parser
32
+ Thread.current[:html_crf_parser] ||= CRFParser.new(:html)
33
+ end
34
+
35
+ class Citation < Hash
36
+
37
+ attr_accessor :probabilities, :overall_probability
38
+
39
+ def initialize(str, parser, author=nil)
40
+ raw_hash, overall_prob, tag_probs = parser.parse(str, author)
41
+ self.replace(raw_hash.symbolize_keys)
42
+ @probabilities = tag_probs.symbolize_keys
43
+ @overall_probability = overall_prob
44
+ end
45
+
46
+ end
47
+
48
+ end