excite 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,128 @@
# encoding: UTF-8

# Array#to_set (used in the HTML examples below) comes from the Set library,
# which Ruby only autoloads from 3.2 onwards. Require it explicitly so this
# spec does not depend on some other file having loaded it first.
require 'set'

# End-to-end examples for the top-level Excite.parse_string and
# Excite.parse_html entry points: degenerate input, several citation styles,
# and marked-up HTML citations.
describe Excite do

  # Shared assertion used by most examples.
  #
  # Checks that the parsed citation's :title equals +title+, and that both the
  # overall probability and the per-field title probability are reported.
  # be_within(0.5).of(0.5) accepts any value from 0.0 to 1.0, so the
  # probability checks only assert that a value in that range is present,
  # not that the parse was confident.
  def title_should_be(cite, title)
    cite[:title].should == title
    cite.overall_probability.should be_within(0.5).of(0.5)
    cite.probabilities[:title].should be_within(0.5).of(0.5)
  end

  context "parse string" do
    it "handles nil" do
      Excite.parse_string(nil).should be_nil
    end

    it "handles empty string" do
      Excite.parse_string("").should be_nil
    end

    # Regression input: a pasted MS Office conditional-comment/style block.
    it "handles string that used to break model" do
      Excite.parse_string("[if gte mso 10]>\r\n<style>\r\n /* Style Definitions */\r\n table.MsoNormalTable\r\n\t{mso-style-name:\"Table Normal\";\r\n\tmso-tstyle-rowband-size:0;\r\n\tmso-tstyle-colband-size:0;\r\n\tmso-style-noshow:yes;\r\n\tmso-style-priority:99;\r\n\tmso-style-parent:\"\";\r\n\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\r\n\tmso-para-margin:0in;\r\n\tmso-para-margin-bottom:.0001pt;\r\n\tmso-pagination:widow-orphan;\r\n\tfont-size:10.0pt;\r\n\tfont-family:\"Times New Roman\",\"serif\";}\r\n</style>\r\n<![endif]")[:year].should be_nil
    end

    it "handles non-ASCII unicode characters" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology » The history of the future りがと. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology » The history of the future")
    end

    it "handles non-citation string" do
      Excite.parse_string("Recently while contemplating hosting options for my startup I decided to take a look at Heroku.")[:authors].should be_nil
    end

    it "parses title for APA journal article" do
      cite = Excite.parse_string("Devine, P. G., & Sherman, S. J. (1992). Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock? Psychological Inquiry, 3(2), 153-159. doi:10.1207/s15327965pli0302_13")
      title_should_be(cite, "Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock")
    end

    it "parses title for Turabian journal article" do
      cite = Excite.parse_string("Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (June): 53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for Turabian book" do
      cite = Excite.parse_string("Okuda, Michael, and Denise Okuda. 1993. Star trek chronology: The history of the future. New York: Pocket Books.")
      title_should_be(cite, "Star trek chronology: The history of the future")
    end

    it "parses title for MLA newspaper article" do
      cite = Excite.parse_string('Di Rado, Alicia. "Trekking through College: Classes Explore Modern Society Using the World of Star Trek." Los Angeles Times 15 Mar. 1995: A3+. Print.')
      title_should_be(cite, 'Trekking through College: Classes Explore Modern Society Using the World of Star Trek')
    end

    it "parses title for Chicago journal article" do
      cite = Excite.parse_string('Wilcox, Rhonda V. 1991. Shifting roles and synthetic women in Star trek: The next generation. Studies in Popular Culture 13 (2): 53-65.')
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    it "parses title for journal article with volume" do
      cite = Excite.parse_string("Watts, S. & Bagnoli, M. (2010). Oligopoly, Disclosure and Earnings Management. The Accounting Review, vol. 85 (4), 1191-1214.")
      title_should_be(cite, "Oligopoly, Disclosure and Earnings Management")
    end

    it "parses title for MLA journal article" do
      cite = Excite.parse_string('Hodges, F. M. "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s." Aging Male 6.3 (2003)')
      title_should_be(cite, "The Promised Planet: Alliances and Struggles of the Gerontocracy in American Television Science Fiction of the 1960s")
    end

    it "parses title for AMA journal article" do
      cite = Excite.parse_string("Wilcox RV. Shifting roles and synthetic women in Star trek: The next generation. Stud Pop Culture. 1991;13:53-65.")
      title_should_be(cite, 'Shifting roles and synthetic women in Star trek: The next generation')
    end

    # The input uses typographic quotes; the expected title uses ASCII ' and `.
    it "parses quoted journal article title" do
      cite = Excite.parse_string('“Standing in Livestock’s ‘Long Shadow’: The Ethics of Eating Meat on a Small Planet,” Ethics & the Environment 16 (2011): 63-93. (pdf)')
      title_should_be(cite, 'Standing in Livestock\'s `Long Shadow\': The Ethics of Eating Meat on a Small Planet')
    end

    it "parses citation prefixed by number" do
      cite = Excite.parse_string('1. “Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models ” Kamil Wezka ,Philip Salmon, Anita Ziedler, Dean Whittaker, James Drewitt, Stefan Klotz, Harry Fisher and D Marrocchelli, Journal of Physics: Condensed Matter 24 502101 (2012)')
      title_should_be(cite, 'Mechanisms of network collapse in GeO2 glass: high-pressure neutron diffraction with isotope substitution as arbitrator of competing models')
    end

    it "parses citation prefixed by number without space" do
      cite = Excite.parse_string("3.“ High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa ” Philip Salmon, James Drewitt, Dean Whittaker, Anita Ziedler, Kamil Wezka, Craig Bull, Mathew Tucker, Martin Wilding, Malcon Guthrie and D Marrocchelli, Journal of Physics: Condensed Matter 24 415102 (2012)")
      title_should_be(cite, 'High pressure neutron diffraction study of GeO2 glass up to 17.5 GPa')
    end

    # NOTE(review): the second argument presumably supplies a known author
    # name as a hint to the parser -- confirm against Excite.parse_string.
    it "parses citation with name not in dict" do
      cite = Excite.parse_string("John Xkcd, Analyzing Phonetic Variation. Journal of Digital Scholarship\nNov. 2011", "John Xkcd")
      title_should_be(cite, "Analyzing Phonetic Variation")
    end

    it "parses citation with parenthetical comment" do
      cite = Excite.parse_string('The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos (University of Pittsburgh Press 2005). (Awarded the Metaphysical Society of America’s 2007 John N. Findlay Book Prize.)')
      title_should_be(cite, 'The Ethics of Creativity: Beauty, Morality, and Nature in a Processive Cosmos')
    end
  end

  context "parse html" do
    it "parses cleanly marked-up cite" do
      cite_str = %{
        <h3 class="PaperTitle">
        <span class="AuthorList">Wangyi Liu, Andrea Bertozzi, and Theodore Kolokolnikov,</span>
        <a class="Title" href="http://www.math.ucla.edu/~bertozzi/papers/CMS-Bobby12-galley.pdf">“Diffuse interface surface tension models in an expanding flow”,</a>
        <span class="Source">Communications in Mathematical Sciences,</span>
        <span class="DisplayDate">2012,</span>
        <span class="Volume">10(1)</span>:<span class="Page">387-418,</span>
        </h3> }

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, "Diffuse interface surface tension models in an expanding flow")

      # Compare as sets: the order of the extracted authors is not asserted.
      cite[:authors].to_set.should == ["Wangyi Liu", "Andrea Bertozzi", "Theodore Kolokolnikov"].to_set
      cite[:journal].should == "Communications in Mathematical Sciences"
    end

    it "parses cite without much punctuation" do
      cite_str = "<span>Devine, PG, & Sherman, SJ</span><span>(1992)</span><strong>Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock?</strong><em>Psychological Inquiry</em><span>3(2), 153-159</span>"

      cite = Excite.parse_html(cite_str)
      title_should_be(cite, 'Intuitive versus rational judgment and the role of stereotyping in the human condition: Kirk or Spock')
    end
  end

end
@@ -0,0 +1,118 @@
# encoding: UTF-8

module Excite

  # Examples for CRFParser: normalisation of a caller-supplied author name and
  # tokenisation of HTML input (both labelled training data and plain input).
  describe CRFParser do

    before do
      @parser = CRFParser.new
    end

    describe "normalize_input_author" do

      it "handles blank" do
        # nil and the empty string are both expected to normalise to nil.
        [nil, ''].each do |blank|
          @parser.normalize_input_author(blank).should be_nil
        end
      end

      it "handles name with junk punctuation" do
        # Surrounding quotes are dropped and the name comes back as
        # lower-cased word tokens.
        @parser.normalize_input_author("'Gertjan van Noord'").should == ['gertjan', 'van', 'noord']
      end

    end

    describe "tokenizing" do

      describe "html training data" do
        # NOTE(review): a constant assigned inside a describe block is defined
        # in the enclosing lexical scope, i.e. as Excite::TAGGED_HTML, not
        # privately to this example group. Consider `let` if nothing else
        # relies on the constant.
        TAGGED_HTML = "<author> &lt;li&gt;González-Bailón, S. </author> <date> (2009) </date> <title> &lt;a&gt;Traps on the Web&lt;/a&gt;. </title> <journal> Information, Communication &amp; Society </journal> <volume> 12 (8) </volume> <pages> 1149-1173.&lt;/li&gt; </pages>"

        it "is labeled correctly" do
          # NOTE(review): the second argument (true) presumably marks the input
          # as labelled training data -- confirm against prepare_token_data.
          tokens = CRFParser.new(:html).prepare_token_data(TAGGED_HTML, true)

          # Each row: [raw token text, name of the token's parent node, label].
          expected = [
            ['González-Bailón', 'li', 'author'],
            [',', 'li', 'author'],
            ['S.', 'li', 'author'],
            ['(', 'li', 'date'],
            ['2009', 'li', 'date'],
            [')', 'li', 'date'],
            ['Traps', 'a', 'title'],
            ['on', 'a', 'title'],
            ['the', 'a', 'title'],
            ['Web', 'a', 'title'],
            ['.', 'li', 'title'],
            ['Information', 'li', 'journal'],
            [',', 'li', 'journal'],
            ['Communication', 'li', 'journal'],
            ['&', 'li', 'journal'],
            ['Society', 'li', 'journal'],
            ['12', 'li', 'volume'],
            ['(', 'li', 'volume'],
            ['8', 'li', 'volume'],
            [')', 'li', 'volume'],
            ['1149-1173', 'li', 'pages'],
            ['.', 'li', 'pages']
          ]

          tokens.length.should == expected.length

          expected.each_with_index do |(raw, parent_name, label), idx|
            token = tokens[idx]
            token.raw.should == raw
            token.node.parent.name.should == parent_name
            token.label.should == label
          end
        end

      end

      describe "html test data" do
        # NOTE(review): as with TAGGED_HTML, this constant is actually defined
        # as Excite::HTML.
        HTML = "<li><b>Author Name</b> (2012) <a>Paper Title.</a><!-- This is a comment -->Journal Title 15:2 123-234.<span>&nbsp;</span></li>"

        it "is stripped of empty tags and comments" do
          tokens = CRFParser.new(:html).prepare_token_data(HTML)

          # Every token must sit on a text node whose parent is neither the
          # comment nor the whitespace-only <span> from the input.
          tokens.each do |token|
            token.node.name.should == 'text'
            %w[comment span].each do |unwanted|
              token.node.parent.name.should_not == unwanted
            end
          end
        end

        it "is tokenized correctly" do
          # Each row: [raw token text, name of the token's parent node].
          expected = [
            ['Author', 'b'],
            ['Name', 'b'],
            ['(', 'li'],
            ['2012', 'li'],
            [')', 'li'],
            ['Paper', 'a'],
            ['Title', 'a'],
            ['.', 'a'],
            ['Journal', 'li'],
            ['Title', 'li'],
            ['15', 'li'],
            [':2', 'li'],
            ['123-234', 'li'],
            ['.', 'li']
          ]

          tokens = CRFParser.new(:html).prepare_token_data(HTML)

          tokens.length.should == expected.length

          expected.each_with_index do |(raw, parent_name), idx|
            token = tokens[idx]
            token.raw.should == raw
            token.node.parent.name.should == parent_name
          end
        end

      end

    end

  end

end
@@ -0,0 +1,68 @@
# encoding: UTF-8

require 'excite/postprocessor'

module Excite

  # Examples for title clean-up, exercised through CRFParser#normalize_title.
  describe Postprocessor do

    describe "normalize_title" do

      # Runs +title+ through normalize_title and returns the cleaned value.
      # normalize_title is handed a hash and the result is read back from the
      # same hash, i.e. it is expected to rewrite the "title" entry in place.
      def normalize(title)
        fields = { "title" => title }
        CRFParser.new.normalize_title(fields)
        fields["title"]
      end

      # One row per example: [description, raw title, expected normalised title].
      examples = [
        ["strips whitespace",
         ' a title .', 'a title'],
        ["strips punctuation",
         '(a title) ', 'a title'],
        ["strips leading numerals",
         '1. A title', 'A title'],
        ["doesn't strip numerals part of the title",
         '1 is the best number of titles', '1 is the best number of titles'],
        ["strips leading roman numerals",
         'xiv. A title', 'A title'],
        ["doesn't strip roman numeral-like title starts",
         'IVs are needles not titles', 'IVs are needles not titles'],
        ["strips leading enumerating letters",
         'A. My title', 'My title'],
        ["doesn't strip leading single letters",
         'A title', 'A title'],
        ["extracts title from between quotes",
         '"A title" which is cool', 'A title'],
        ["doesn't reduce title to quote part",
         'This title comments on "some other title": a crappy work',
         'This title comments on "some other title": a crappy work'],
        ["chops content after a newline",
         "A title\nActually an author or journal", 'A title'],
        ["doesn't chop content after a newline if there's not enough before the newline",
         "A\ntitle mostly after the newline", "A\ntitle mostly after the newline"]
      ]

      # Generate one example per row, in table order.
      examples.each do |description, raw_title, cleaned_title|
        it description do
          normalize(raw_title).should == cleaned_title
        end
      end

    end

  end
end