excite 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--require spec_helper --color
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
GIT
|
2
|
+
remote: git://github.com/academia-edu/engtagger.git
|
3
|
+
revision: 89af3ad6d8299f80e25dad64a4fd49e6a991abaf
|
4
|
+
specs:
|
5
|
+
engtagger (0.1.2)
|
6
|
+
|
7
|
+
PATH
|
8
|
+
remote: .
|
9
|
+
specs:
|
10
|
+
excite (2.1.0)
|
11
|
+
activesupport
|
12
|
+
crfpp
|
13
|
+
nokogiri
|
14
|
+
openurl
|
15
|
+
|
16
|
+
GEM
|
17
|
+
remote: https://rubygems.org/
|
18
|
+
specs:
|
19
|
+
activesupport (3.2.12)
|
20
|
+
i18n (~> 0.6)
|
21
|
+
multi_json (~> 1.0)
|
22
|
+
coderay (1.0.8)
|
23
|
+
columnize (0.3.6)
|
24
|
+
crfpp (0.0.4)
|
25
|
+
debugger (1.2.3)
|
26
|
+
columnize (>= 0.3.1)
|
27
|
+
debugger-linecache (~> 1.1.1)
|
28
|
+
debugger-ruby_core_source (~> 1.1.5)
|
29
|
+
debugger-linecache (1.1.2)
|
30
|
+
debugger-ruby_core_source (>= 1.1.1)
|
31
|
+
debugger-ruby_core_source (1.1.6)
|
32
|
+
diff-lcs (1.1.3)
|
33
|
+
ensure_valid_encoding (0.5.3)
|
34
|
+
i18n (0.6.4)
|
35
|
+
marc (0.5.0)
|
36
|
+
method_source (0.8.1)
|
37
|
+
multi_json (1.6.1)
|
38
|
+
nokogiri (1.5.6)
|
39
|
+
openurl (0.4.2)
|
40
|
+
ensure_valid_encoding
|
41
|
+
marc
|
42
|
+
pry (0.9.10)
|
43
|
+
coderay (~> 1.0.5)
|
44
|
+
method_source (~> 0.8)
|
45
|
+
slop (~> 3.3.1)
|
46
|
+
pry-debugger (0.2.1)
|
47
|
+
debugger (~> 1.2.0)
|
48
|
+
pry (~> 0.9.10)
|
49
|
+
rake (10.0.3)
|
50
|
+
rspec (2.12.0)
|
51
|
+
rspec-core (~> 2.12.0)
|
52
|
+
rspec-expectations (~> 2.12.0)
|
53
|
+
rspec-mocks (~> 2.12.0)
|
54
|
+
rspec-core (2.12.2)
|
55
|
+
rspec-expectations (2.12.1)
|
56
|
+
diff-lcs (~> 1.1.3)
|
57
|
+
rspec-mocks (2.12.1)
|
58
|
+
slop (3.3.3)
|
59
|
+
|
60
|
+
PLATFORMS
|
61
|
+
ruby
|
62
|
+
|
63
|
+
DEPENDENCIES
|
64
|
+
engtagger!
|
65
|
+
excite!
|
66
|
+
pry
|
67
|
+
pry-debugger
|
68
|
+
rake
|
69
|
+
rspec
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Public Display Inc
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
22
|
+
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Excite
|
2
|
+
|
3
|
+
Provides a simple Ruby API for parsing citations from plain text strings or HTML.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
require 'excite'
|
9
|
+
|
10
|
+
Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
|
11
|
+
|
12
|
+
Excite.parse_html("<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>")
|
13
|
+
```
|
14
|
+
|
15
|
+
## History and Credits
|
16
|
+
|
17
|
+
Derived from [FreeCite](http://freecite.library.brown.edu/), minus Rails and all UI elements. The most up-to-date fork of FreeCite of which I am aware is [rsinger's](https://github.com/rsinger/free_cite). FreeCite in turn is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/).
|
18
|
+
|
19
|
+
The main changes are:
|
20
|
+
* No UI, just a gem;
|
21
|
+
* New model for parsing HTML;
|
22
|
+
* Tokenization and part-of-speech features from [EngTagger](https://github.com/yohasebe/engtagger).
|
23
|
+
|
24
|
+
Credit is due to the authors of all the linked projects, as well as Laura Durkay who marked up the HTML training data.
|
25
|
+
|
26
|
+
## Install required packages
|
27
|
+
|
28
|
+
### From source
|
29
|
+
|
30
|
+
wget http://crfpp.googlecode.com/files/CRF%2B%2B-0.57.tar.gz
|
31
|
+
tar xvzf CRF++-0.57.tar.gz
|
32
|
+
cd CRF++-0.57
|
33
|
+
./configure
|
34
|
+
make
|
35
|
+
sudo make install
|
36
|
+
|
37
|
+
### On Ubuntu
|
38
|
+
|
39
|
+
sudo apt-add-repository 'deb http://cl.naist.jp/~eric-n/ubuntu-nlp oneiric all'
|
40
|
+
sudo apt-get update
|
41
|
+
sudo apt-get install libcrf++
|
42
|
+
|
43
|
+
### On OS X with Homebrew
|
44
|
+
|
45
|
+
brew install crf++
|
46
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'rake'
|
4
|
+
require 'bundler/gem_tasks'
|
5
|
+
require 'rspec/core/rake_task'
|
6
|
+
require 'excite'
|
7
|
+
|
8
|
+
RSpec::Core::RakeTask.new :spec
|
9
|
+
|
10
|
+
# Rake tasks for training and evaluating the CRF citation-parser model.
# Both tasks accept an optional :type argument (e.g. rake crfparser:train_model[html]).
namespace :crfparser do
  # Turns the optional :type task argument into a mode symbol,
  # falling back to :string when no type was supplied.
  parser_mode = lambda do |task_args|
    requested = task_args[:type]
    requested ? requested.to_sym : :string
  end

  desc 'train a CRF model for the citation parser'
  task :train_model, :type do |_, args|
    Excite::CRFParser.new(parser_mode.call(args)).train
  end

  desc 'test a CRF model with the current training set & features'
  task :test_model, :type do |_, args|
    # The model test harness lives outside lib/, so it is loaded only when needed.
    require "#{File.dirname(__FILE__)}/model/test/model_test"
    Excite::ModelTest.new(parser_mode.call(args)).run_test
  end
end
|
24
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# This file has two parts: `rules`, a hash of named regular-expression rules, and `order`, an array declaring the order in which the rules are run.
|
2
|
+
# The keys of each rules hash are:
|
3
|
+
# regex (required): a string representation of the regular expression rule
|
4
|
+
# ignore_case (optional): a boolean to define whether or not the regex is case insensitive (the "i" flag)
|
5
|
+
# replacement_str (optional): the replacement string. Absence of this attribute assumes "" (empty string)
|
6
|
+
|
7
|
+
|
8
|
+
order:
|
9
|
+
- sub_quotes_0
|
10
|
+
- sub_quotes_1
|
11
|
+
- strip_quotes_0
|
12
|
+
|
13
|
+
# TODO Figure out which rules might be helpful, and which are dumber versions of model features
|
14
|
+
rules:
|
15
|
+
vol_0:
  # Description: be sure a whitespace exists between string 'vol.' and the actual volume number.
  # The class is spelled A-Za-z because the range A-z also covers the ASCII
  # punctuation between 'Z' and 'a' ([ \ ] ^ _ `). The quantifier is + so that
  # an already well-spaced 'vol. 12' is left alone instead of gaining a second space.
  regex: \svol\.([A-Za-z0-9]+)
  ignore_case: true
  replacement_str: ' vol. \1'
|
20
|
+
vol_1:
|
21
|
+
# Description: replace 'Vol' with 'vol.'
|
22
|
+
regex: \sVol\W
|
23
|
+
replacement_str: ' vol. '
|
24
|
+
vol_2:
|
25
|
+
# Description: replace 'Volume' with 'vol.' (lowercased 'volume' seems fine)
|
26
|
+
regex: \sVolume\W
|
27
|
+
replacement_str: ' vol. '
|
28
|
+
year_0:
|
29
|
+
# Description: Years generally appear in the form (YYYY), remove any variants ((YYYY/YY), etc.)
|
30
|
+
regex: \s\(([0-9]{4})\/?[0-9]*\)\.?\s
|
31
|
+
replacement_str: ' (\1) '
|
32
|
+
number_range_0:
  # Description: Ranges (generally pages) must not have spaces around the hyphen,
  # e.g. '53 - 65' is rewritten to '53-65'.
  regex: ([0-9]+)\s*-\s*([0-9]+)
  replacement_str: '\1-\2'
|
36
|
+
vol_iss_0:
|
37
|
+
# Description: Volume and issue sometimes appear as '123, (4)' or '123 (4)': rewrite to '123:4'
|
38
|
+
regex: \s([0-9]{1,3}),?\s\(([0-9]+)\)
|
39
|
+
replacement_str: ' \1:\2'
|
40
|
+
iss_0:
|
41
|
+
# Description: 'Issue' needs to be lowercased
|
42
|
+
regex: \sIssue\b
|
43
|
+
replacement_str: ' issue'
|
44
|
+
eresource_0:
|
45
|
+
# Description: remove the junk about it being an electronic copy
|
46
|
+
regex: \s?\[electronic (journal|resource|version)\]
|
47
|
+
ignore_case: true
|
48
|
+
eresource_1:
|
49
|
+
# Description: variant cruft about it being an electronic copy
|
50
|
+
regex: \s?\[online\]
|
51
|
+
ignore_case: true
|
52
|
+
leading_source_0:
|
53
|
+
# Description: Many citations lead with the database they were found in: 'JSTOR', 'EBSCO', etc. These need to be removed
|
54
|
+
regex: ^(JSTOR|Blackwell Synergy)\W*
|
55
|
+
ignore_case: true
|
56
|
+
sub_quotes_0:
|
57
|
+
# Replace non-ASCII double quotation marks
|
58
|
+
regex: "[“”‟„"〝〞〟]"
|
59
|
+
replacement_str: '"'
|
60
|
+
sub_quotes_1:
|
61
|
+
# Replace non-ASCII single quotation marks
|
62
|
+
regex: "[‘’‛‚'`]"
|
63
|
+
replacement_str: "'"
|
64
|
+
strip_quotes_0:
|
65
|
+
# Remove leading and trailing quotes from citation string
|
66
|
+
regex: ^\s*\"(.*)\"\s*$
|
67
|
+
replacement_str: \1
|
68
|
+
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Specify feature print-order here, and refer to the associated numbers
|
2
|
+
# when writing the template file
|
3
|
+
string:
|
4
|
+
feature_order:
|
5
|
+
- 'last_char' # 1
|
6
|
+
- 'first_1_char' # 2
|
7
|
+
- 'first_2_chars' # 3
|
8
|
+
- 'first_3_chars' # 4
|
9
|
+
- 'first_4_chars' # 5
|
10
|
+
- 'last_1_char' # 6
|
11
|
+
- 'last_2_chars' # 7
|
12
|
+
- 'last_3_chars' # 8
|
13
|
+
- 'last_4_chars' # 9
|
14
|
+
- 'toklcnp' # 10
|
15
|
+
- 'capitalization' # 11
|
16
|
+
- 'numbers' # 12
|
17
|
+
- 'a_is_in_dict' # 13
|
18
|
+
- 'firstName' # 14
|
19
|
+
- 'lastName' # 15
|
20
|
+
- 'monthName' # 16
|
21
|
+
- 'placeName' # 17
|
22
|
+
- 'publisherName' # 18
|
23
|
+
- 'possible_editor' # 19
|
24
|
+
- 'location' # 20
|
25
|
+
- 'punct' # 21
|
26
|
+
- 'is_in' # 22
|
27
|
+
- 'part_of_speech' # 23
|
28
|
+
|
29
|
+
html:
|
30
|
+
feature_order:
|
31
|
+
- 'last_char' # 1
|
32
|
+
- 'first_1_char' # 2
|
33
|
+
- 'first_2_chars' # 3
|
34
|
+
- 'first_3_chars' # 4
|
35
|
+
- 'first_4_chars' # 5
|
36
|
+
- 'last_1_char' # 6
|
37
|
+
- 'last_2_chars' # 7
|
38
|
+
- 'last_3_chars' # 8
|
39
|
+
- 'last_4_chars' # 9
|
40
|
+
- 'toklcnp' # 10
|
41
|
+
- 'capitalization' # 11
|
42
|
+
- 'numbers' # 12
|
43
|
+
- 'a_is_in_dict' # 13
|
44
|
+
- 'firstName' # 14
|
45
|
+
- 'lastName' # 15
|
46
|
+
- 'monthName' # 16
|
47
|
+
- 'placeName' # 17
|
48
|
+
- 'publisherName' # 18
|
49
|
+
- 'possible_editor' # 19
|
50
|
+
- 'location' # 20
|
51
|
+
- 'punct' # 21
|
52
|
+
- 'is_in' # 22
|
53
|
+
- 'tag_name' # 23
|
54
|
+
- 'location_in_node' # 24
|
55
|
+
- 'part_of_speech' # 25
|
data/excite.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
|
6
|
+
require 'excite/version'
|
7
|
+
|
8
|
+
# Gem specification for excite: metadata, packaged files and dependencies.
Gem::Specification.new do |gem|
  gem.name          = "excite"
  gem.version       = Excite::VERSION
  gem.authors       = ["David Judd"]
  gem.email         = ["david@academia.edu"]
  gem.summary       = %q{Parse citations}
  gem.homepage      = "http://github.com/academia-edu/free_cite"

  # Split the file listing on newlines. Splitting on $\ (nil by default) makes
  # String#split break on any whitespace, which mangles paths containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies
  gem.add_dependency 'openurl'
  gem.add_dependency 'activesupport'
  gem.add_dependency 'crfpp'
  gem.add_dependency 'nokogiri'

  # Development-only dependencies
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'pry-debugger'
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Basic descriptive statistics for arrays of numbers.
#
# NOTE: this reopens the core Array class, so the methods below are visible on
# every Array in the process. `mean` relies on Array#sum being available.
class Array
  # Arithmetic mean of the elements.
  #
  # Returns a Float, or the Integer 0 for an empty array.
  def mean
    (size > 0) ? sum.to_f / size : 0
  end

  # Population standard deviation (divides by size, not size - 1).
  #
  # Returns a Float, or the Integer 0 for an empty array.
  def stddev
    return 0 unless size > 0

    m = mean
    devsum = inject(0) { |acc, x| acc + (x - m)**2 }
    (devsum.to_f / size)**0.5
  end

  # Population covariance with +other+, computed as E[XY] - E[X]E[Y].
  #
  # Raises RuntimeError when the vectors differ in length: pairing them up
  # would otherwise drop the extra elements of +other+ or multiply by nil.
  def cov(other)
    unless size == other.size
      raise "Vectors must be of same length to calculate cov"
    end
    zip(other).map { |a, b| a * b }.mean - (mean * other.mean)
  end

  # Pearson correlation coefficient between this array and +other+.
  #
  # Returns 0.0 when either vector has zero standard deviation, since the
  # coefficient is undefined there.
  # Raises RuntimeError when the vectors differ in length.
  def pearson_r(other)
    unless size == other.size
      raise "Vectors must be of same length to calculate pearson_r"
    end
    devp = stddev * other.stddev
    (devp > 0) ? cov(other) / devp : 0.0
  end
end
|
27
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'active_support/core_ext/object'
|
4
|
+
|
5
|
+
module Excite

  # Parse a plain-text string into a citation.
  #
  # str    - the citation text
  # author - optional presumed author, passed through to the parser
  #
  # Returns a Citation, or nil when +str+ is blank.
  def self.parse_string(str, author=nil)
    if str.present?
      Citation.new(str, string_parser, author)
    end
  end

  class << self
    alias_method :parse, :parse_string # for backwards compatibility
  end

  # Parse an HTML fragment into a citation.
  #
  # html   - the citation markup
  # author - optional presumed author, passed through to the parser
  #
  # Returns a Citation, or nil when +html+ is blank.
  def self.parse_html(html, author=nil)
    if html.present?
      Citation.new(html, html_parser, author)
    end
  end

  # Parser for plain-text input, memoized in Thread.current under
  # :string_crf_parser.
  def self.string_parser
    Thread.current[:string_crf_parser] ||= CRFParser.new(:string)
  end

  # Parser for HTML input, memoized in Thread.current under :html_crf_parser.
  def self.html_parser
    Thread.current[:html_crf_parser] ||= CRFParser.new(:html)
  end

  # A bare `private` does not apply to methods defined with `def self.`, so the
  # parser helpers are hidden explicitly.
  private_class_method :string_parser, :html_parser

  # A parsed citation: a Hash of the parsed fields with symbol keys, plus the
  # probabilities returned by the parser.
  class Citation < Hash

    # probabilities       - per-tag probabilities from the parser (symbol keys)
    # overall_probability - overall probability returned by the parser
    attr_accessor :probabilities, :overall_probability

    # str    - raw citation text or HTML
    # parser - object responding to parse(str, author) and returning
    #          [fields_hash, overall_probability, tag_probabilities_hash]
    # author - optional presumed author, passed through to the parser
    def initialize(str, parser, author=nil)
      raw_hash, overall_prob, tag_probs = parser.parse(str, author)
      self.replace(raw_hash.symbolize_keys)
      @probabilities = tag_probs.symbolize_keys
      @overall_probability = overall_prob
    end

  end

end
|