excite 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,128 @@
1
+ # encoding: UTF-8
2
+
3
# Array#to_set (used by the "parse html" examples) is provided by the set library.
require 'set'

# End-to-end examples for Excite.parse_string and Excite.parse_html:
# each example feeds a raw citation (plain text or HTML) to the parser and
# checks the extracted fields, mostly the title.
describe Excite do

  # Shared assertion for the title examples below.
  #
  # cite  - the object returned by Excite.parse_string / Excite.parse_html;
  #         it is indexed with [:title] and asked for #overall_probability
  #         and #probabilities.
  # title - the exact title string the parser is expected to extract.
  #
  # Besides the title itself, this checks that the overall probability and
  # the title probability are within 0.5 of 0.5, i.e. between 0 and 1.
  def title_should_be(cite, title)
    cite[:title].should == title
    cite.overall_probability.should be_within(0.5).of(0.5)
    cite.probabilities[:title].should be_within(0.5).of(0.5)
  end

  context "parse string" do
    it "handles nil" do
      Excite.parse_string(nil).should be_nil
    end

    it "handles empty string" do
      Excite.parse_string("").should be_nil
    end

    it "handles string that used to break model" do
      Excite.parse_string("[if gte mso 10]>\r\n<style>\r\n /* Style Definitions */\r\n table.MsoNormalTable\r\n\t{mso-style-name:\"Table Normal\";\r\n\tmso-tstyle-rowband-size:0;\r\n\tmso-tstyle-colband-size:0;\r\n\tmso-style-noshow:yes;\r\n\tmso-style-priority:99;\r\n\tmso-style-parent:\"\";\r\n\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\r\n\tmso-para-margin:0in;\r\n\tmso-para-margin-bottom:.0001pt;\r\n\tmso-pagination:widow-orphan;\r\n\tfont-size:10.0pt;\r\n\tfont-family:\"Times New Roman\",\"serif\";}\r\n</style>\r\n<![endif]")[:year].should be_nil
    end

    it "handles non-ASCII unicode characters" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology » The history of the future りがと. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology » The history of the future")
    end

    it "handles non-citation string" do
      Excite.parse_string("Recently while contemplating hosting options for my startup I decided to take a look at Heroku.")[:authors].should be_nil
    end

    it "parses title for APA journal article" do
      cite = Excite.parse_string("Devine, P. G., & Sherman, S. J. (1992). Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock? Psychological Inquiry, 3(2), 153-159. doi:10.1207/s15327965pli0302_13")
      title_should_be(cite, "Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock")
    end

    it "parses title for Turabian journal article" do
      cite = Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for Turabian book" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology: The history of the future. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology: The history of the future")
    end

    it "parses title for MLA newspaper article" do
      cite = Excite.parse_string('Di Rado, Alicia. "Trekking through College: Classes Explore Modern Society Using the World of Star Trek." Los Angeles Times 15 Mar. 1995: A3+. Print.')
      title_should_be(cite, 'Trekking through College: Classes Explore Modern Society Using the World of Star Trek')
    end

    it "parses title for Chicago journal article" do
      cite = Excite.parse_string('Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (2): 53-65.')
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for journal article with volume" do
      cite = Excite.parse_string("Watts, S. & Bagnoli, M. (2010). Oligopoly, Disclosure and Earnings Management. The Accounting Review, vol. 85 (4), 1191-1214.")
      title_should_be(cite, "Oligopoly, Disclosure and Earnings Management")
    end

    it "parses title for MLA journal article" do
      cite = Excite.parse_string('Hodges, F. M. "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s." Aging Male 6.3 (2003)')
      title_should_be(cite, "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s")
    end

    it "parses title for AMA journal article" do
      cite = Excite.parse_string("Wilcox RV. Shifting roles and synthetic women in Star trek: The next generation. Stud Pop Culture. 1991;13:53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses quoted journal article title" do
      cite = Excite.parse_string('“Standing in Livestock’s ‘Long Shadow’: The Ethics of Eating Meat on a Small Planet,” Ethics & the Environment 16 (2011): 63-93. (pdf)')
      title_should_be(cite, 'Standing in Livestock\'s `Long Shadow\': The Ethics of Eating Meat on a Small Planet')
    end

    it "parses citation prefixed by number" do
      cite = Excite.parse_string('1. “Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models ” Kamil Wezka ,Philip Salmon, Anita Ziedler, Dean Whittaker, James Drewitt, Stefan Klotz, Harry Fisher and D Marrocchelli, Journal of Physics: Condensed Matter 24 502101 (2012)')
      title_should_be(cite, 'Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models')
    end

    it "parses citation prefixed by number without space" do
      cite = Excite.parse_string("3.“ High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa ” Philip Salmon, James Drewitt, Dean Whittaker, Anita Ziedler, Kamil Wezka, Craig Bull, Mathew Tucker, Martin Wilding, Malcon Guthrie and D Marrocchelli, Journal of Physics: Condensed Matter 24 415102 (2012)")
      title_should_be(cite, 'High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa')
    end

    it "parses citation with name not in dict" do
      # The second argument repeats the author name that opens the citation.
      cite = Excite.parse_string("John Xkcd, Analyzing Phonetic Variation. Journal of Digital Scholarship\nNov. 2011", "John Xkcd")
      title_should_be(cite, "Analyzing Phonetic Variation")
    end

    it "parses citation with parenthetical comment" do
      cite = Excite.parse_string('The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos (University of Pittsburgh Press 2005). (Awarded the Metaphysical Society of America’s 2007 John N. Findlay Book Prize.)')
      title_should_be(cite, 'The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos')
    end
  end

  context "parse html" do
    it "parses cleanly marked-up cite" do
      cite_str = %{
        <h3 class="PaperTitle">
        <span class="AuthorList">Wangyi Liu, Andrea Bertozzi, and Theodore Kolokolnikov,</span>
        <a class="Title" href="http://www.math.ucla.edu/~bertozzi/papers/CMS-Bobby12-galley.pdf">“Diffuse interface surface tension models in an expanding flow”,</a>
        <span class="Source">Communications in Mathematical Sciences,</span>
        <span class="DisplayDate">2012,</span>
        <span class="Volume">10(1)</span>:<span class="Page">387-418,</span>
        </h3> }

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, "Diffuse interface surface tension models in an expanding flow")

      # Compare as sets so the order in which authors are returned is irrelevant.
      cite[:authors].to_set.should == ["Wangyi Liu", "Andrea Bertozzi", "Theodore Kolokolnikov"].to_set
      cite[:journal].should == "Communications in Mathematical Sciences"
    end

    it "parses cite without much punctuation" do
      cite_str = "<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>"

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, 'Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock')
    end
  end

end
@@ -0,0 +1,118 @@
1
+ # encoding: UTF-8
2
+
3
module Excite

  # Examples for CRFParser#normalize_input_author and for the token data
  # produced by CRFParser#prepare_token_data in :html mode.
  describe CRFParser do

    before do
      @parser = CRFParser.new
    end

    describe "normalize_input_author" do

      it "handles blank" do
        @parser.normalize_input_author(nil).should be_nil
        @parser.normalize_input_author('').should be_nil
      end

      it "handles name with junk punctuation" do
        res = @parser.normalize_input_author("'Gertjan van Noord'")
        res.should == ['gertjan', 'van', 'noord']
      end

    end

    describe "tokenizing" do

      describe "html training data" do
        # Fixture is a `let` rather than a constant: a constant assigned inside
        # a describe block would be defined on the enclosing Excite module.
        let(:tagged_html) do
          "<author> &lt;li&gt;González-Bailón, S. </author> <date> (2009) </date> <title> &lt;a&gt;Traps on the Web&lt;/a&gt;. </title> <journal> Information, Communication &amp; Society </journal> <volume> 12 (8) </volume> <pages> 1149-1173.&lt;/li&gt; </pages>"
        end

        it "is labeled correctly" do
          toks = CRFParser.new(:html).prepare_token_data(tagged_html, true)

          # Each row: [raw token text, name of the token node's parent, label]
          expected = [
            ['González-Bailón', 'li', 'author'],
            [',', 'li', 'author'],
            ['S.', 'li', 'author'],
            ['(', 'li', 'date'],
            ['2009', 'li', 'date'],
            [')', 'li', 'date'],
            ['Traps', 'a', 'title'],
            ['on', 'a', 'title'],
            ['the', 'a', 'title'],
            ['Web', 'a', 'title'],
            ['.', 'li', 'title'],
            ['Information', 'li', 'journal'],
            [',', 'li', 'journal'],
            ['Communication', 'li', 'journal'],
            ['&', 'li', 'journal'],
            ['Society', 'li', 'journal'],
            ['12', 'li', 'volume'],
            ['(', 'li', 'volume'],
            ['8', 'li', 'volume'],
            [')', 'li', 'volume'],
            ['1149-1173', 'li', 'pages'],
            ['.', 'li', 'pages']
          ]

          toks.length.should == expected.length

          expected.each_with_index do |(raw, parent_name, label), i|
            tok = toks[i]
            tok.raw.should == raw
            tok.node.parent.name.should == parent_name
            tok.label.should == label
          end
        end

      end

      describe "html test data" do
        # Also a `let`, for the same reason as the training-data fixture:
        # a constant here would become Excite::HTML.
        let(:html) do
          "<li><b>Author Name</b> (2012) <a>Paper Title.</a><!-- This is a comment -->Journal Title 15:2 123-234.<span>&nbsp;</span></li>"
        end

        it "is stripped of empty tags and comments" do
          toks = CRFParser.new(:html).prepare_token_data(html)

          toks.each do |tok|
            tok.node.name.should == 'text'
            tok.node.parent.name.should_not == 'comment'
            tok.node.parent.name.should_not == 'span'
          end
        end

        it "is tokenized correctly" do
          # Each row: [raw token text, name of the token node's parent]
          expected = [
            ['Author', 'b'],
            ['Name', 'b'],
            ['(', 'li'],
            ['2012', 'li'],
            [')', 'li'],
            ['Paper', 'a'],
            ['Title', 'a'],
            ['.', 'a'],
            ['Journal', 'li'],
            ['Title', 'li'],
            ['15', 'li'],
            [':2', 'li'],
            ['123-234', 'li'],
            ['.', 'li']
          ]

          toks = CRFParser.new(:html).prepare_token_data(html)

          toks.length.should == expected.length

          expected.each_with_index do |(raw, parent_name), i|
            tok = toks[i]
            tok.raw.should == raw
            tok.node.parent.name.should == parent_name
          end
        end

      end

    end

  end

end
@@ -0,0 +1,68 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'excite/postprocessor'
4
+
5
module Excite

  # Examples for title normalization, exercised through
  # CRFParser#normalize_title.
  describe Postprocessor do

    describe "normalize_title" do

      # Runs normalize_title on a hash holding only the given title and
      # returns whatever ends up under the "title" key afterwards.
      def normalize(title)
        fields = { "title" => title }
        CRFParser.new.normalize_title(fields)
        fields["title"]
      end

      # Each row: [example description, raw title, expected normalized title]
      [
        ["strips whitespace",
         ' a title .',
         'a title'],
        ["strips punctuation",
         '(a title) ',
         'a title'],
        ["strips leading numerals",
         '1. A title',
         'A title'],
        ["doesn't strip numerals part of the title",
         '1 is the best number of titles',
         '1 is the best number of titles'],
        ["strips leading roman numerals",
         'xiv. A title',
         'A title'],
        ["doesn't strip roman numeral-like title starts",
         'IVs are needles not titles',
         'IVs are needles not titles'],
        ["strips leading enumerating letters",
         'A. My title',
         'My title'],
        ["doesn't strip leading single letters",
         'A title',
         'A title'],
        ["extracts title from between quotes",
         '"A title" which is cool',
         'A title'],
        ["doesn't reduce title to quote part",
         'This title comments on "some other title": a crappy work',
         'This title comments on "some other title": a crappy work'],
        ["chops content after a newline",
         "A title\nActually an author or journal",
         'A title'],
        ["doesn't chop content after a newline if there's not enough before the newline",
         "A\ntitle mostly after the newline",
         "A\ntitle mostly after the newline"]
      ].each do |behaviour, raw_title, normalized_title|
        it behaviour do
          normalize(raw_title).should == normalized_title
        end
      end

    end

  end
end