excite 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
@@ -0,0 +1,128 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
describe Excite do

  # Shared expectation used by the examples below.
  #
  # Asserts that the parsed citation's :title equals +title+, and that both
  # the overall probability and the per-field title probability are sane
  # probabilities: be_within(0.5).of(0.5) accepts any value from 0.0 to 1.0.
  #
  # cite  - the object returned by Excite.parse_string / Excite.parse_html
  # title - the expected title String
  def title_should_be(cite, title)
    cite[:title].should == title
    cite.overall_probability.should be_within(0.5).of(0.5)
    cite.probabilities[:title].should be_within(0.5).of(0.5)
  end

  context "parse string" do
    it "handles nil" do
      Excite.parse_string(nil).should be_nil
    end

    it "handles empty string" do
      Excite.parse_string("").should be_nil
    end

    # Regression input: a conditional-comment/style fragment of the kind
    # pasted from MS Office, which contains no citation at all.
    it "handles string that used to break model" do
      Excite.parse_string("[if gte mso 10]>\r\n<style>\r\n /* Style Definitions */\r\n table.MsoNormalTable\r\n\t{mso-style-name:\"Table Normal\";\r\n\tmso-tstyle-rowband-size:0;\r\n\tmso-tstyle-colband-size:0;\r\n\tmso-style-noshow:yes;\r\n\tmso-style-priority:99;\r\n\tmso-style-parent:\"\";\r\n\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\r\n\tmso-para-margin:0in;\r\n\tmso-para-margin-bottom:.0001pt;\r\n\tmso-pagination:widow-orphan;\r\n\tfont-size:10.0pt;\r\n\tfont-family:\"Times New Roman\",\"serif\";}\r\n</style>\r\n<![endif]")[:year].should be_nil
    end

    it "handles non-ASCII unicode characters" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology » The history of the future りがと. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology » The history of the future")
    end

    it "handles non-citation string" do
      Excite.parse_string("Recently while contemplating hosting options for my startup I decided to take a look at Heroku.")[:authors].should be_nil
    end

    it "parses title for APA journal article" do
      cite = Excite.parse_string("Devine, P. G., & Sherman, S. J. (1992). Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock? Psychological Inquiry, 3(2), 153-159. doi:10.1207/s15327965pli0302_13")
      title_should_be(cite, "Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock")
    end

    it "parses title for Turabian journal article" do
      cite = Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for Turabian book" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology: The history of the future. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology: The history of the future")
    end

    it "parses title for MLA newspaper article" do
      cite = Excite.parse_string('Di Rado, Alicia. "Trekking through College: Classes Explore Modern Society Using the World of Star Trek." Los Angeles Times 15 Mar. 1995: A3+. Print.')
      title_should_be(cite, 'Trekking through College: Classes Explore Modern Society Using the World of Star Trek')
    end

    it "parses title for Chicago journal article" do
      cite = Excite.parse_string('Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (2): 53-65.')
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for journal article with volume" do
      cite = Excite.parse_string("Watts, S. & Bagnoli, M. (2010). Oligopoly, Disclosure and Earnings Management. The Accounting Review, vol. 85 (4), 1191-1214.")
      title_should_be(cite, "Oligopoly, Disclosure and Earnings Management")
    end

    it "parses title for MLA journal article" do
      cite = Excite.parse_string('Hodges, F. M. "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s." Aging Male 6.3 (2003)')
      title_should_be(cite, "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s")
    end

    it "parses title for AMA journal article" do
      cite = Excite.parse_string("Wilcox RV. Shifting roles and synthetic women in Star trek: The next generation. Stud Pop Culture. 1991;13:53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    # The input uses typographic quotes; the expected title uses the ASCII
    # forms (' and `) that the parser is expected to emit for them.
    it "parses quoted journal article title" do
      cite = Excite.parse_string('“Standing in Livestock’s ‘Long Shadow’: The Ethics of Eating Meat on a Small Planet,” Ethics & the Environment 16 (2011): 63-93. (pdf)')
      title_should_be(cite, 'Standing in Livestock\'s `Long Shadow\': The Ethics of Eating Meat on a Small Planet')
    end

    it "parses citation prefixed by number" do
      cite = Excite.parse_string('1. “Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models ” Kamil Wezka ,Philip Salmon, Anita Ziedler, Dean Whittaker, James Drewitt, Stefan Klotz, Harry Fisher and D Marrocchelli, Journal of Physics: Condensed Matter 24 502101 (2012)')
      title_should_be(cite, 'Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models')
    end

    it "parses citation prefixed by number without space" do
      cite = Excite.parse_string("3.“ High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa ” Philip Salmon, James Drewitt, Dean Whittaker, Anita Ziedler, Kamil Wezka, Craig Bull, Mathew Tucker, Martin Wilding, Malcon Guthrie and D Marrocchelli, Journal of Physics: Condensed Matter 24 415102 (2012)")
      title_should_be(cite, 'High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa')
    end

    # The second argument passes the author's name alongside the citation text.
    it "parses citation with name not in dict" do
      cite = Excite.parse_string("John Xkcd, Analyzing Phonetic Variation. Journal of Digital Scholarship\nNov. 2011", "John Xkcd")
      title_should_be(cite, "Analyzing Phonetic Variation")
    end

    it "parses citation with parenthetical comment" do
      cite = Excite.parse_string('The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos (University of Pittsburgh Press 2005). (Awarded the Metaphysical Society of America’s 2007 John N. Findlay Book Prize.)')
      title_should_be(cite, 'The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos')
    end
  end

  context "parse html" do
    it "parses cleanly marked-up cite" do
      cite_str = %{
        <h3 class="PaperTitle">
        <span class="AuthorList">Wangyi Liu, Andrea Bertozzi, and Theodore Kolokolnikov,</span>
        <a class="Title" href="http://www.math.ucla.edu/~bertozzi/papers/CMS-Bobby12-galley.pdf">“Diffuse interface surface tension models in an expanding flow”,</a>
        <span class="Source">Communications in Mathematical Sciences,</span>
        <span class="DisplayDate">2012,</span>
        <span class="Volume">10(1)</span>:<span class="Page">387-418,</span>
        </h3> }

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, "Diffuse interface surface tension models in an expanding flow")

      # Compare as sets so the order in which authors are returned is irrelevant.
      cite[:authors].to_set.should == ["Wangyi Liu", "Andrea Bertozzi", "Theodore Kolokolnikov"].to_set
      cite[:journal].should == "Communications in Mathematical Sciences"
    end

    it "parses cite without much punctuation" do
      cite_str = "<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>"

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, 'Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock')
    end
  end

end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Excite

  describe CRFParser do

    let(:parser) { CRFParser.new }

    describe "normalize_input_author" do
      it "handles blank" do
        parser.normalize_input_author(nil).should be_nil
        parser.normalize_input_author('').should be_nil
      end

      it "handles name with junk punctuation" do
        res = parser.normalize_input_author("'Gertjan van Noord'")
        res.should == ['gertjan', 'van', 'noord']
      end
    end

    describe "tokenizing" do

      describe "html training data" do
        # Fixture is a `let` rather than a constant: a constant assigned inside
        # a describe block is lexically scoped to the enclosing module, so it
        # would leak into the Excite namespace.
        let(:tagged_html) do
          "<author> <li>González-Bailón, S. </author> <date> (2009) </date> <title> <a>Traps on the Web</a>. </title> <journal> Information, Communication & Society </journal> <volume> 12 (8) </volume> <pages> 1149-1173.</li> </pages>"
        end

        it "is labeled correctly" do
          # NOTE(review): the second argument presumably marks the input as
          # labeled training data — confirm against prepare_token_data.
          toks = CRFParser.new(:html).prepare_token_data(tagged_html, true)

          # Each row: [raw token text, name of the node's parent element, label]
          expected = [
            ['González-Bailón', 'li', 'author'],
            [',', 'li', 'author'],
            ['S.', 'li', 'author'],
            ['(', 'li', 'date'],
            ['2009', 'li', 'date'],
            [')', 'li', 'date'],
            ['Traps', 'a', 'title'],
            ['on', 'a', 'title'],
            ['the', 'a', 'title'],
            ['Web', 'a', 'title'],
            ['.', 'li', 'title'],
            ['Information', 'li', 'journal'],
            [',', 'li', 'journal'],
            ['Communication', 'li', 'journal'],
            ['&', 'li', 'journal'],
            ['Society', 'li', 'journal'],
            ['12', 'li', 'volume'],
            ['(', 'li', 'volume'],
            ['8', 'li', 'volume'],
            [')', 'li', 'volume'],
            ['1149-1173', 'li', 'pages'],
            ['.', 'li', 'pages']
          ]

          toks.length.should == expected.length

          expected.each_with_index do |e, i|
            t = toks[i]
            t.raw.should == e[0]
            t.node.parent.name.should == e[1]
            t.label.should == e[2]
          end
        end
      end

      describe "html test data" do
        # Unlabeled markup containing a comment and a whitespace-only <span>,
        # both of which the examples expect to contribute no tokens.
        let(:html) do
          "<li><b>Author Name</b> (2012) <a>Paper Title.</a><!-- This is a comment -->Journal Title 15:2 123-234.<span> </span></li>"
        end

        it "is stripped of empty tags and comments" do
          toks = CRFParser.new(:html).prepare_token_data(html)

          toks.each do |tok|
            tok.node.name.should == 'text'
            tok.node.parent.name.should_not == 'comment'
            tok.node.parent.name.should_not == 'span'
          end
        end

        it "is tokenized correctly" do
          # Each row: [raw token text, name of the node's parent element]
          expected = [
            ['Author', 'b'],
            ['Name', 'b'],
            ['(', 'li'],
            ['2012', 'li'],
            [')', 'li'],
            ['Paper', 'a'],
            ['Title', 'a'],
            ['.', 'a'],
            ['Journal', 'li'],
            ['Title', 'li'],
            ['15', 'li'],
            [':2', 'li'],
            ['123-234', 'li'],
            ['.', 'li']
          ]

          toks = CRFParser.new(:html).prepare_token_data(html)

          toks.length.should == expected.length

          expected.each_with_index do |e, i|
            t = toks[i]
            t.raw.should == e[0]
            t.node.parent.name.should == e[1]
          end
        end
      end

    end

  end

end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'excite/postprocessor'
|
4
|
+
|
5
|
+
module Excite

  describe Postprocessor do

    describe "normalize_title" do

      # Wraps +title+ in a field hash under the "title" key, hands the hash to
      # CRFParser#normalize_title, and returns whatever is stored under
      # "title" afterwards.
      # NOTE(review): normalize_title is called on a CRFParser instance;
      # presumably CRFParser includes Postprocessor — confirm.
      def normalize(title)
        fields = { "title" => title }
        fields.tap { |f| CRFParser.new.normalize_title(f) }["title"]
      end

      # Each row: [example description, raw title, expected normalized title]
      [
        ["strips whitespace",
         ' a title .',
         'a title'],
        ["strips punctuation",
         '(a title) ',
         'a title'],
        ["strips leading numerals",
         '1. A title',
         'A title'],
        ["doesn't strip numerals part of the title",
         '1 is the best number of titles',
         '1 is the best number of titles'],
        ["strips leading roman numerals",
         'xiv. A title',
         'A title'],
        ["doesn't strip roman numeral-like title starts",
         'IVs are needles not titles',
         'IVs are needles not titles'],
        ["strips leading enumerating letters",
         'A. My title',
         'My title'],
        ["doesn't strip leading single letters",
         'A title',
         'A title'],
        ["extracts title from between quotes",
         '"A title" which is cool',
         'A title'],
        ["doesn't reduce title to quote part",
         'This title comments on "some other title": a crappy work',
         'This title comments on "some other title": a crappy work'],
        ["chops content after a newline",
         "A title\nActually an author or journal",
         'A title'],
        ["doesn't chop content after a newline if there's not enough before the newline",
         "A\ntitle mostly after the newline",
         "A\ntitle mostly after the newline"]
      ].each do |description, raw_title, normalized_title|
        it description do
          normalize(raw_title).should == normalized_title
        end
      end

    end

  end

end
|