excite 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--require spec_helper --color
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
GIT
|
2
|
+
remote: git://github.com/academia-edu/engtagger.git
|
3
|
+
revision: 89af3ad6d8299f80e25dad64a4fd49e6a991abaf
|
4
|
+
specs:
|
5
|
+
engtagger (0.1.2)
|
6
|
+
|
7
|
+
PATH
|
8
|
+
remote: .
|
9
|
+
specs:
|
10
|
+
excite (2.1.0)
|
11
|
+
activesupport
|
12
|
+
crfpp
|
13
|
+
nokogiri
|
14
|
+
openurl
|
15
|
+
|
16
|
+
GEM
|
17
|
+
remote: https://rubygems.org/
|
18
|
+
specs:
|
19
|
+
activesupport (3.2.12)
|
20
|
+
i18n (~> 0.6)
|
21
|
+
multi_json (~> 1.0)
|
22
|
+
coderay (1.0.8)
|
23
|
+
columnize (0.3.6)
|
24
|
+
crfpp (0.0.4)
|
25
|
+
debugger (1.2.3)
|
26
|
+
columnize (>= 0.3.1)
|
27
|
+
debugger-linecache (~> 1.1.1)
|
28
|
+
debugger-ruby_core_source (~> 1.1.5)
|
29
|
+
debugger-linecache (1.1.2)
|
30
|
+
debugger-ruby_core_source (>= 1.1.1)
|
31
|
+
debugger-ruby_core_source (1.1.6)
|
32
|
+
diff-lcs (1.1.3)
|
33
|
+
ensure_valid_encoding (0.5.3)
|
34
|
+
i18n (0.6.4)
|
35
|
+
marc (0.5.0)
|
36
|
+
method_source (0.8.1)
|
37
|
+
multi_json (1.6.1)
|
38
|
+
nokogiri (1.5.6)
|
39
|
+
openurl (0.4.2)
|
40
|
+
ensure_valid_encoding
|
41
|
+
marc
|
42
|
+
pry (0.9.10)
|
43
|
+
coderay (~> 1.0.5)
|
44
|
+
method_source (~> 0.8)
|
45
|
+
slop (~> 3.3.1)
|
46
|
+
pry-debugger (0.2.1)
|
47
|
+
debugger (~> 1.2.0)
|
48
|
+
pry (~> 0.9.10)
|
49
|
+
rake (10.0.3)
|
50
|
+
rspec (2.12.0)
|
51
|
+
rspec-core (~> 2.12.0)
|
52
|
+
rspec-expectations (~> 2.12.0)
|
53
|
+
rspec-mocks (~> 2.12.0)
|
54
|
+
rspec-core (2.12.2)
|
55
|
+
rspec-expectations (2.12.1)
|
56
|
+
diff-lcs (~> 1.1.3)
|
57
|
+
rspec-mocks (2.12.1)
|
58
|
+
slop (3.3.3)
|
59
|
+
|
60
|
+
PLATFORMS
|
61
|
+
ruby
|
62
|
+
|
63
|
+
DEPENDENCIES
|
64
|
+
engtagger!
|
65
|
+
excite!
|
66
|
+
pry
|
67
|
+
pry-debugger
|
68
|
+
rake
|
69
|
+
rspec
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Public Display Inc
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
22
|
+
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Excite
|
2
|
+
|
3
|
+
Provides a simple Ruby API for parsing citations from plain text strings or HTML.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
require 'excite'
|
9
|
+
|
10
|
+
Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
|
11
|
+
|
12
|
+
Excite.parse_html("<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>")
|
13
|
+
```
|
14
|
+
|
15
|
+
## History and Credits
|
16
|
+
|
17
|
+
Derived from [FreeCite](http://freecite.library.brown.edu/), minus Rails and all UI elements. The most up-to-date fork of FreeCite of which I am aware is [rsinger's](https://github.com/rsinger/free_cite). FreeCite in turn is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/).
|
18
|
+
|
19
|
+
The main changes are:
|
20
|
+
* No UI, just a gem;
|
21
|
+
* New model for parsing HTML;
|
22
|
+
* Tokenization and part-of-speech features from [EngTagger](https://github.com/yohasebe/engtagger).
|
23
|
+
|
24
|
+
Credit is due to the authors of all the linked projects, as well as Laura Durkay, who marked up the HTML training data.
|
25
|
+
|
26
|
+
## Install required packages
|
27
|
+
|
28
|
+
### From source
|
29
|
+
|
30
|
+
wget http://crfpp.googlecode.com/files/CRF%2B%2B-0.57.tar.gz
|
31
|
+
tar xvzf CRF++-0.57.tar.gz
|
32
|
+
cd CRF++-0.57
|
33
|
+
./configure
|
34
|
+
make
|
35
|
+
sudo make install
|
36
|
+
|
37
|
+
### On Ubuntu
|
38
|
+
|
39
|
+
sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
|
40
|
+
sudo apt-get update
|
41
|
+
sudo apt-get install libcrf++
|
42
|
+
|
43
|
+
### On OS X with Homebrew
|
44
|
+
|
45
|
+
brew install crf++
|
46
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'rake'
|
4
|
+
require 'bundler/gem_tasks'
|
5
|
+
require 'rspec/core/rake_task'
|
6
|
+
require 'excite'
|
7
|
+
|
8
|
+
RSpec::Core::RakeTask.new :spec
|
9
|
+
|
10
|
+
namespace :crfparser do
  # Resolves the optional :type task argument to a parser mode symbol.
  # Falls back to :string when no argument is given, so
  # `rake crfparser:train_model` and `rake crfparser:train_model[string]`
  # behave the same.
  parser_mode = ->(args) { args[:type] ? args[:type].to_sym : :string }

  desc 'train a CRF model for the citation parser'
  task :train_model, :type do |_, args|
    Excite::CRFParser.new(parser_mode.call(args)).train
  end

  desc 'test a CRF model with the current training set & features'
  task :test_model, :type do |_, args|
    # Required here rather than at the top of the file, so it is only
    # loaded when this task actually runs.
    require "#{File.dirname(__FILE__)}/model/test/model_test"
    Excite::ModelTest.new(parser_mode.call(args)).run_test
  end
end
|
24
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# This file has two parts: `rules`, a hash of regular-expression rules, and `order`, an array declaring the order in which the rules are run.
|
2
|
+
# The keys of each rules hash are:
|
3
|
+
# regex (required): a string representation of the regular expression rule
|
4
|
+
# ignore_case (optional): a boolean to define whether or not the regex is case insensitive (the "i" flag)
|
5
|
+
# replacement_str (optional): the replacement string. Absence of this attribute assumes "" (empty string)
|
6
|
+
|
7
|
+
|
8
|
+
order:
|
9
|
+
- sub_quotes_0
|
10
|
+
- sub_quotes_1
|
11
|
+
- strip_quotes_0
|
12
|
+
|
13
|
+
# TODO Figure out which rules might be helpful, and which are dumber versions of model features
|
14
|
+
rules:
|
15
|
+
vol_0:
  # Description: be sure a whitespace exists between string 'vol.' and the actual volume number
  # The letter ranges are spelled out because `A-z` also spans the ASCII
  # punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `).
  regex: \svol\.([A-Za-z0-9]*)
  ignore_case: true
  replacement_str: ' vol. \1'
|
20
|
+
vol_1:
|
21
|
+
# Description: replace 'Vol' with 'vol.'
|
22
|
+
regex: \sVol\W
|
23
|
+
replacement_str: ' vol. '
|
24
|
+
vol_2:
|
25
|
+
# Description: replace 'Volume' with 'vol.' (lowercased 'volume' seems fine)
|
26
|
+
regex: \sVolume\W
|
27
|
+
replacement_str: ' vol. '
|
28
|
+
year_0:
|
29
|
+
# Description: Years generally appear in the form (YYYY), remove any variants ((YYYY/YY), etc.)
|
30
|
+
regex: \s\(([0-9]{4})\/?[0-9]*\)\.?\s
|
31
|
+
replacement_str: ' (\1) '
|
32
|
+
number_range_0:
  # Description: Ranges (generally pages) can't have spaces around the hyphen,
  # e.g. '53 - 65' is rewritten to '53-65'.
  # NOTE(review): the previous pattern was a verbatim copy of year_0's and
  # never matched a range. This rule is not listed in `order`; confirm the
  # pattern against real citations before adding it there.
  regex: ([0-9]+)\s*-\s*([0-9]+)
  replacement_str: '\1-\2'
|
36
|
+
vol_iss_0:
|
37
|
+
# Description: Volume and issue sometimes appear as '123, (4)' or '123 (4)': rewrite to '123:4'
|
38
|
+
regex: \s([0-9]{1,3}),?\s\(([0-9]+)\)
|
39
|
+
replacement_str: ' \1:\2'
|
40
|
+
iss_0:
|
41
|
+
# Description: 'Issue' needs to be lowercased
|
42
|
+
regex: \sIssue\b
|
43
|
+
replacement_str: ' issue'
|
44
|
+
eresource_0:
|
45
|
+
# Description: remove the junk about it being an electronic copy
|
46
|
+
regex: \s?\[electronic (journal|resource|version)\]
|
47
|
+
ignore_case: true
|
48
|
+
eresource_1:
|
49
|
+
# Description: variant cruft about it being an electronic copy
|
50
|
+
regex: \s?\[online\]
|
51
|
+
ignore_case: true
|
52
|
+
leading_source_0:
|
53
|
+
# Description: Many citations lead with the database they were found in: 'JSTOR', 'EBSCO', etc. These need to be removed
|
54
|
+
regex: ^(JSTOR|Blackwell Synergy)\W*
|
55
|
+
ignore_case: true
|
56
|
+
sub_quotes_0:
|
57
|
+
# Replace non-ASCII double quotation marks
|
58
|
+
regex: "[“”‟„"〝〞〟]"
|
59
|
+
replacement_str: '"'
|
60
|
+
sub_quotes_1:
|
61
|
+
# Replace non-ASCII single quotation marks
|
62
|
+
regex: "[‘’‛‚'`]"
|
63
|
+
replacement_str: "'"
|
64
|
+
strip_quotes_0:
|
65
|
+
# Remove leading and trailing quotes from citation string
|
66
|
+
regex: ^\s*\"(.*)\"\s*$
|
67
|
+
replacement_str: \1
|
68
|
+
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Specify feature print-order here, and refer to the associated numbers
|
2
|
+
# when writing the template file
|
3
|
+
string:
|
4
|
+
feature_order:
|
5
|
+
- 'last_char' # 1
|
6
|
+
- 'first_1_char' # 2
|
7
|
+
- 'first_2_chars' # 3
|
8
|
+
- 'first_3_chars' # 4
|
9
|
+
- 'first_4_chars' # 5
|
10
|
+
- 'last_1_char' # 6
|
11
|
+
- 'last_2_chars' # 7
|
12
|
+
- 'last_3_chars' # 8
|
13
|
+
- 'last_4_chars' # 9
|
14
|
+
- 'toklcnp' # 10
|
15
|
+
- 'capitalization' # 11
|
16
|
+
- 'numbers' # 12
|
17
|
+
- 'a_is_in_dict' # 13
|
18
|
+
- 'firstName' # 14
|
19
|
+
- 'lastName' # 15
|
20
|
+
- 'monthName' # 16
|
21
|
+
- 'placeName' # 17
|
22
|
+
- 'publisherName' # 18
|
23
|
+
- 'possible_editor' # 19
|
24
|
+
- 'location' # 20
|
25
|
+
- 'punct' # 21
|
26
|
+
- 'is_in' # 22
|
27
|
+
- 'part_of_speech' # 23
|
28
|
+
|
29
|
+
html:
|
30
|
+
feature_order:
|
31
|
+
- 'last_char' # 1
|
32
|
+
- 'first_1_char' # 2
|
33
|
+
- 'first_2_chars' # 3
|
34
|
+
- 'first_3_chars' # 4
|
35
|
+
- 'first_4_chars' # 5
|
36
|
+
- 'last_1_char' # 6
|
37
|
+
- 'last_2_chars' # 7
|
38
|
+
- 'last_3_chars' # 8
|
39
|
+
- 'last_4_chars' # 9
|
40
|
+
- 'toklcnp' # 10
|
41
|
+
- 'capitalization' # 11
|
42
|
+
- 'numbers' # 12
|
43
|
+
- 'a_is_in_dict' # 13
|
44
|
+
- 'firstName' # 14
|
45
|
+
- 'lastName' # 15
|
46
|
+
- 'monthName' # 16
|
47
|
+
- 'placeName' # 17
|
48
|
+
- 'publisherName' # 18
|
49
|
+
- 'possible_editor' # 19
|
50
|
+
- 'location' # 20
|
51
|
+
- 'punct' # 21
|
52
|
+
- 'is_in' # 22
|
53
|
+
- 'tag_name' # 23
|
54
|
+
- 'location_in_node' # 24
|
55
|
+
- 'part_of_speech' # 25
|
data/excite.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
|
6
|
+
require 'excite/version'
|
7
|
+
|
8
|
+
# Gem specification for excite. The version comes from Excite::VERSION,
# which is loaded by the `require 'excite/version'` above.
Gem::Specification.new do |gem|
  gem.name        = "excite"
  gem.version     = Excite::VERSION
  gem.authors     = ["David Judd"]
  gem.email       = ["david@academia.edu"]
  gem.summary     = %q{Parse citations}
  gem.description = %q{Provides a simple Ruby API for parsing citations from plain text strings or HTML.}
  gem.homepage    = "http://github.com/academia-edu/free_cite"
  gem.license     = "MIT"

  # Split on newlines explicitly. `$\` is nil by default, which makes
  # String#split fall back to splitting on any whitespace and would cut a
  # tracked path containing a space into pieces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies
  gem.add_dependency 'openurl'
  gem.add_dependency 'activesupport'
  gem.add_dependency 'crfpp'
  gem.add_dependency 'nokogiri'

  # Development-only dependencies
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'pry-debugger'
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Descriptive-statistics helpers added directly to Array.
# Every method expects the receiver (and `other`, where given) to contain
# only numeric values.
class Array
  # Arithmetic mean of the elements.
  #
  # Returns 0 for an empty array, otherwise a Float.
  def mean
    return 0 if empty?

    sum.to_f / size
  end

  # Population standard deviation (divides by size, not size - 1).
  #
  # Returns 0 for an empty array, otherwise a Float.
  def stddev
    return 0 if empty?

    m = mean
    squared_deviations = inject(0) { |acc, x| acc + (x - m)**2 }
    (squared_deviations.to_f / size)**0.5
  end

  # Population covariance between the receiver and +other+, computed as
  # mean(a * b) - mean(a) * mean(b).
  #
  # Raises ArgumentError when the two arrays differ in length. Without
  # this check `zip` would silently truncate a longer +other+ (while
  # other.mean still covered all of it) or pad a shorter one with nil.
  def cov(other)
    unless size == other.size
      raise ArgumentError, "Vectors must be of same length to calculate cov"
    end

    products = zip(other).map { |a, b| a * b }
    products.mean - (mean * other.mean)
  end

  # Pearson correlation coefficient between the receiver and +other+.
  #
  # Returns 0.0 when either vector has zero standard deviation, since the
  # coefficient is undefined there.
  # Raises RuntimeError when the two arrays differ in length.
  def pearson_r(other)
    unless size == other.size
      raise "Vectors must be of same length to calculate pearson_r"
    end

    devp = stddev * other.stddev
    devp > 0 ? cov(other) / devp : 0.0
  end
end
|
27
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'active_support/core_ext/object'
|
4
|
+
|
5
|
+
module Excite

  # Parses a plain-text citation string.
  #
  # str    - the citation text
  # author - optional presumed author, passed through to the parser
  #
  # Returns a Citation, or nil when str is blank.
  def self.parse_string(str, author=nil)
    return unless str.present?

    Citation.new(str, string_parser, author)
  end

  class << self
    alias_method :parse, :parse_string # for backwards compatibility
  end

  # Parses a citation given as an HTML fragment.
  #
  # html   - the citation markup
  # author - optional presumed author, passed through to the parser
  #
  # Returns a Citation, or nil when html is blank.
  def self.parse_html(html, author=nil)
    return unless html.present?

    Citation.new(html, html_parser, author)
  end

  # The two helpers below are internal. They are left public on purpose:
  # a bare `private` does not apply to methods defined with `def self.`,
  # so they have always been callable from outside, and hiding them now
  # could break existing callers.

  # Parser for plain-text citations, built on first use and then reused
  # via Thread.current.
  def self.string_parser
    Thread.current[:string_crf_parser] ||= CRFParser.new(:string)
  end

  # Parser for HTML citations, built on first use and then reused via
  # Thread.current.
  def self.html_parser
    Thread.current[:html_crf_parser] ||= CRFParser.new(:html)
  end

  # A parsed citation: a Hash of the parsed fields keyed by symbol, plus
  # the probabilities reported by the parser.
  class Citation < Hash

    # probabilities       - per-tag probabilities, keyed by symbol
    # overall_probability - overall probability reported by the parser
    attr_accessor :probabilities, :overall_probability

    # str    - the raw citation (text or HTML, depending on the parser)
    # parser - object whose parse(str, author) returns
    #          [fields_hash, overall_probability, tag_probabilities]
    # author - optional presumed author, passed through to the parser
    def initialize(str, parser, author=nil)
      raw_hash, overall_prob, tag_probs = parser.parse(str, author)
      replace(raw_hash.symbolize_keys)
      @probabilities = tag_probs.symbolize_keys
      @overall_probability = overall_prob
    end

  end

end
|