excite 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
@@ -0,0 +1,128 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Array#to_set comes from the set library, which Ruby only loads automatically
# from 3.2 onwards. Require it here so the author comparison below does not
# depend on some other file having loaded it first.
require 'set'

# End-to-end specs for the top-level Excite.parse_string and Excite.parse_html
# entry points, run against blank input, non-citation text, and citations
# written in several styles (APA, Turabian, MLA, Chicago, AMA).
describe Excite do

  context "parse string" do
    it "handles nil" do
      Excite.parse_string(nil).should be_nil
    end

    it "handles empty string" do
      Excite.parse_string("").should be_nil
    end

    it "handles string that used to break model" do
      Excite.parse_string("[if gte mso 10]>\r\n<style>\r\n /* Style Definitions */\r\n table.MsoNormalTable\r\n\t{mso-style-name:\"Table Normal\";\r\n\tmso-tstyle-rowband-size:0;\r\n\tmso-tstyle-colband-size:0;\r\n\tmso-style-noshow:yes;\r\n\tmso-style-priority:99;\r\n\tmso-style-parent:\"\";\r\n\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\r\n\tmso-para-margin:0in;\r\n\tmso-para-margin-bottom:.0001pt;\r\n\tmso-pagination:widow-orphan;\r\n\tfont-size:10.0pt;\r\n\tfont-family:\"Times New Roman\",\"serif\";}\r\n</style>\r\n<![endif]")[:year].should be_nil
    end

    it "handles non-ASCII unicode characters" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology » The history of the future りがと. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology » The history of the future")
    end

    it "handles non-citation string" do
      Excite.parse_string("Recently while contemplating hosting options for my startup I decided to take a look at Heroku.")[:authors].should be_nil
    end

    it "parses title for APA journal article" do
      cite = Excite.parse_string("Devine, P. G., & Sherman, S. J. (1992). Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock? Psychological Inquiry, 3(2), 153-159. doi:10.1207/s15327965pli0302_13")
      title_should_be(cite, "Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock")
    end

    it "parses title for Turabian journal article" do
      cite = Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for Turabian book" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology: The history of the future. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology: The history of the future")
    end

    it "parses title for MLA newspaper article" do
      cite = Excite.parse_string('Di Rado, Alicia. "Trekking through College: Classes Explore Modern Society Using the World of Star Trek." Los Angeles Times 15 Mar. 1995: A3+. Print.')
      title_should_be(cite, 'Trekking through College: Classes Explore Modern Society Using the World of Star Trek')
    end

    it "parses title for Chicago journal article" do
      cite = Excite.parse_string('Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (2): 53-65.')
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for journal article with volume" do
      cite = Excite.parse_string("Watts, S. & Bagnoli, M. (2010). Oligopoly, Disclosure and Earnings Management. The Accounting Review, vol. 85 (4), 1191-1214.")
      title_should_be(cite, "Oligopoly, Disclosure and Earnings Management")
    end

    it "parses title for MLA journal article" do
      cite = Excite.parse_string('Hodges, F. M. "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s." Aging Male 6.3 (2003)')
      title_should_be(cite, "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s")
    end

    it "parses title for AMA journal article" do
      cite = Excite.parse_string("Wilcox RV. Shifting roles and synthetic women in Star trek: The next generation. Stud Pop Culture. 1991;13:53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses quoted journal article title" do
      cite = Excite.parse_string('“Standing in Livestock’s ‘Long Shadow’: The Ethics of Eating Meat on a Small Planet,” Ethics & the Environment 16 (2011): 63-93. (pdf)')
      # The expected title uses ASCII quote characters where the input has
      # typographic ones.
      title_should_be(cite, 'Standing in Livestock\'s `Long Shadow\': The Ethics of Eating Meat on a Small Planet')
    end

    it "parses citation prefixed by number" do
      cite = Excite.parse_string('1. “Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models ” Kamil Wezka ,Philip Salmon, Anita Ziedler, Dean Whittaker, James Drewitt, Stefan Klotz, Harry Fisher and D Marrocchelli, Journal of Physics: Condensed Matter 24 502101 (2012)')
      title_should_be(cite, 'Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models')
    end

    it "parses citation prefixed by number without space" do
      cite = Excite.parse_string("3.“ High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa ” Philip Salmon, James Drewitt, Dean Whittaker, Anita Ziedler, Kamil Wezka, Craig Bull, Mathew Tucker, Martin Wilding, Malcon Guthrie and D Marrocchelli, Journal of Physics: Condensed Matter 24 415102 (2012)")
      title_should_be(cite, 'High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa')
    end

    it "parses citation with name not in dict" do
      # The second argument passes the author's name alongside the citation text.
      cite = Excite.parse_string("John Xkcd, Analyzing Phonetic Variation. Journal of Digital Scholarship\nNov. 2011", "John Xkcd")
      title_should_be(cite, "Analyzing Phonetic Variation")
    end

    it "parses citation with parenthetical comment" do
      cite = Excite.parse_string('The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos (University of Pittsburgh Press 2005). (Awarded the Metaphysical Society of America’s 2007 John N. Findlay Book Prize.)')
      title_should_be(cite, 'The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos')
    end
  end

  context "parse html" do
    it "parses cleanly marked-up cite" do
      # The markup is kept flush-left so the bytes of the string literal are
      # left exactly as they were.
      cite_str = %{
<h3 class="PaperTitle">
<span class="AuthorList">Wangyi Liu, Andrea Bertozzi, and Theodore Kolokolnikov,</span>
<a class="Title" href="http://www.math.ucla.edu/~bertozzi/papers/CMS-Bobby12-galley.pdf">“Diffuse interface surface tension models in an expanding flow”,</a>
<span class="Source">Communications in Mathematical Sciences,</span>
<span class="DisplayDate">2012,</span>
<span class="Volume">10(1)</span>:<span class="Page">387-418,</span>
</h3> }

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, "Diffuse interface surface tension models in an expanding flow")

      # Compared as sets so the order in which authors are returned is ignored.
      cite[:authors].to_set.should == ["Wangyi Liu", "Andrea Bertozzi", "Theodore Kolokolnikov"].to_set
      cite[:journal].should == "Communications in Mathematical Sciences"
    end

    it "parses cite without much punctuation" do
      cite_str = "<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>"

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, 'Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock')
    end
  end

  # Asserts that +cite+ carries the expected +title+ and exposes probability
  # values for the citation as a whole and for the title field.
  #
  # be_within(0.5).of(0.5) accepts any value from 0.0 to 1.0, so the two
  # probability checks only verify that the values are valid probabilities;
  # they do not require the parser to be confident.
  def title_should_be(cite, title)
    cite[:title].should == title
    cite.overall_probability.should be_within(0.5).of(0.5)
    cite.probabilities[:title].should be_within(0.5).of(0.5)
  end

end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Excite

  # Specs for CRFParser#normalize_input_author and for how
  # CRFParser#prepare_token_data tokenizes HTML input, both with and without
  # training labels.
  describe CRFParser do

    before do
      @parser = CRFParser.new
    end

    describe "normalize_input_author" do

      it "handles blank" do
        @parser.normalize_input_author(nil).should be_nil
        @parser.normalize_input_author('').should be_nil
      end

      it "handles name with junk punctuation" do
        res = @parser.normalize_input_author("'Gertjan van Noord'")
        res.should == ['gertjan', 'van', 'noord']
      end

    end

    describe "tokenizing" do

      describe "html training data" do
        # Defined with let rather than as a constant: a constant assigned
        # inside a describe block is not scoped to the block and would be
        # created on the enclosing Excite module.
        let(:tagged_html) do
          "<author> <li>González-Bailón, S. </author> <date> (2009) </date> <title> <a>Traps on the Web</a>. </title> <journal> Information, Communication & Society </journal> <volume> 12 (8) </volume> <pages> 1149-1173.</li> </pages>"
        end

        it "is labeled correctly" do
          toks = CRFParser.new(:html).prepare_token_data(tagged_html, true)

          # Each row: raw token text, name of the parent element, label.
          expected = [
            ['González-Bailón', 'li', 'author'],
            [',', 'li', 'author'],
            ['S.', 'li', 'author'],
            ['(', 'li', 'date'],
            ['2009', 'li', 'date'],
            [')', 'li', 'date'],
            ['Traps', 'a', 'title'],
            ['on', 'a', 'title'],
            ['the', 'a', 'title'],
            ['Web', 'a', 'title'],
            ['.', 'li', 'title'],
            ['Information', 'li', 'journal'],
            [',', 'li', 'journal'],
            ['Communication', 'li', 'journal'],
            ['&', 'li', 'journal'],
            ['Society', 'li', 'journal'],
            ['12', 'li', 'volume'],
            ['(', 'li', 'volume'],
            ['8', 'li', 'volume'],
            [')', 'li', 'volume'],
            ['1149-1173', 'li', 'pages'],
            ['.', 'li', 'pages']
          ]

          # Comparing whole arrays checks the token count and every field in
          # one assertion, so a failure reports all mismatches together.
          actual = toks.map { |t| [t.raw, t.node.parent.name, t.label] }
          actual.should == expected
        end

      end

      describe "html test data" do
        # See the note on tagged_html: let keeps this fixture out of the
        # Excite namespace, where a constant named HTML would otherwise land.
        let(:html) do
          "<li><b>Author Name</b> (2012) <a>Paper Title.</a><!-- This is a comment -->Journal Title 15:2 123-234.<span> </span></li>"
        end

        it "is stripped of empty tags and comments" do
          toks = CRFParser.new(:html).prepare_token_data(html)

          # Guard against the loop below passing without checking anything.
          toks.should_not be_empty

          toks.each do |tok|
            tok.node.name.should == 'text'
            tok.node.parent.name.should_not == 'comment'
            tok.node.parent.name.should_not == 'span'
          end
        end

        it "is tokenized correctly" do
          # Each row: raw token text, name of the parent element.
          expected = [
            ['Author', 'b'],
            ['Name', 'b'],
            ['(', 'li'],
            ['2012', 'li'],
            [')', 'li'],
            ['Paper', 'a'],
            ['Title', 'a'],
            ['.', 'a'],
            ['Journal', 'li'],
            ['Title', 'li'],
            ['15', 'li'],
            [':2', 'li'],
            ['123-234', 'li'],
            ['.', 'li']
          ]

          toks = CRFParser.new(:html).prepare_token_data(html)

          actual = toks.map { |t| [t.raw, t.node.parent.name] }
          actual.should == expected
        end

      end

    end

  end

end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'excite/postprocessor'
|
4
|
+
|
5
|
+
module Excite

  # Specs for title clean-up, exercised through CRFParser#normalize_title.
  describe Postprocessor do

    describe "normalize_title" do

      # Each row: example description, raw title, title expected afterwards.
      [
        ["strips whitespace",
         ' a title .',
         'a title'],
        ["strips punctuation",
         '(a title) ',
         'a title'],
        ["strips leading numerals",
         '1. A title',
         'A title'],
        ["doesn't strip numerals part of the title",
         '1 is the best number of titles',
         '1 is the best number of titles'],
        ["strips leading roman numerals",
         'xiv. A title',
         'A title'],
        ["doesn't strip roman numeral-like title starts",
         'IVs are needles not titles',
         'IVs are needles not titles'],
        ["strips leading enumerating letters",
         'A. My title',
         'My title'],
        ["doesn't strip leading single letters",
         'A title',
         'A title'],
        ["extracts title from between quotes",
         '"A title" which is cool',
         'A title'],
        ["doesn't reduce title to quote part",
         'This title comments on "some other title": a crappy work',
         'This title comments on "some other title": a crappy work'],
        ["chops content after a newline",
         "A title\nActually an author or journal",
         'A title'],
        ["doesn't chop content after a newline if there's not enough before the newline",
         "A\ntitle mostly after the newline",
         "A\ntitle mostly after the newline"]
      ].each do |behaviour, raw_title, cleaned_title|
        it behaviour do
          normalize(raw_title).should == cleaned_title
        end
      end

      # Wraps +title+ in a hash under the "title" key, hands the hash to
      # CRFParser#normalize_title, and returns whatever is stored under
      # "title" afterwards. The return value of normalize_title is ignored,
      # so this relies on the hash being updated in place.
      def normalize(title)
        { "title" => title }.tap { |fields| CRFParser.new.normalize_title(fields) }["title"]
      end

    end

  end
end
|