whitepaper 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/whitepaper/engine/acm.rb +89 -0
- data/lib/whitepaper/engine/citeseerx.rb +18 -17
- data/lib/whitepaper/engine/ieeexplore.rb +70 -0
- data/lib/whitepaper/paper.rb +32 -0
- data/lib/whitepaper/version.rb +1 -1
- data/lib/whitepaper.rb +6 -1
- metadata +3 -1
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module Whitepaper
|
4
|
+
module Engine
|
5
|
+
# This engine uses the ACM database to query metadata about a paper.
|
6
|
+
module ACM
|
7
|
+
# The domain to use for ACM.
|
8
|
+
DOMAIN = "https://dl.acm.org"
|
9
|
+
|
10
|
+
# Alternative search template: takes a "{title_query}" placeholder (an
# advanced "within" expression) and a "{title}" placeholder.
# NOTE(review): no code shown in this file reads this constant -- confirm
# whether it is still needed.
|
11
|
+
SEARCH_BY_TITLE_URL2 = "results.cfm?within={title_query}&adv=1&DL=ACM&termzone=Title&allofem={title}"
|
12
|
+
|
13
|
+
# The url template, relative to DOMAIN, used to search by title. Each
# "{title}" placeholder is filled in by find_by_title_url.
SEARCH_BY_TITLE_URL = "results.cfm?query={title}&querydisp={title}&srt=score%20dsc&short=0&coll=DL&dl=GUIDE&source_disp=&source_query=&since_month=&since_year=&before_month=&before_year=&termshow=matchall&range_query="
|
14
|
+
|
15
|
+
class << self
|
16
|
+
# Returns a url that will query for the given title keywords.
#
# title - the title keywords as a String, separated by whitespace.
#
# Every "{title}" placeholder in SEARCH_BY_TITLE_URL is replaced with the
# form-encoded title: spaces become "+", and reserved characters such as
# "&", "#" and "=" are percent-escaped so they cannot alter the query
# string. A "{title_query}" placeholder, when the template contains one, is
# replaced with an or-ed list of quoted per-word Title: terms.
def find_by_title_url(title)
  # NOTE(review): URI is expected to be loaded already, since mechanize
  # (required by this file) depends on it -- confirm.
  encoded_title = URI.encode_www_form_component(title)
  encoded_words = title.split(" ").map { |word| URI.encode_www_form_component(word) }
  title_query = "(Title:\"" + encoded_words.join("\"+or+Title:\"") + "\")"

  # The block form of gsub inserts the replacement literally, so nothing in
  # the replacement text is treated as a back-reference.
  query = SEARCH_BY_TITLE_URL
    .gsub(/\{title\}/) { encoded_title }
    .gsub(/\{title_query\}/) { title_query }

  "#{DOMAIN}/#{query}"
end
|
22
|
+
|
23
|
+
# Returns a Whitepaper::Paper by searching for the paper with the given
# title keywords.
#
# The search results page is fetched and the first link carrying the
# "medium-text" class is followed to obtain the paper details.
def find_by_title(title)
  @agent = Mechanize.new

  # In case cookies are ever necessary to establish:
  #page = @agent.get("#{DOMAIN}")
  #search_url = page.search('//form[@name="qiksearch"]').first.attribute("action").to_s

  results_page = @agent.get(find_by_title_url(title))

  # Only the first hit is considered.
  first_hit = results_page.search('//a[@class="medium-text"]').first

  retrieve_details "#{DOMAIN}/#{first_hit.attribute("href")}"
end
|
40
|
+
|
41
|
+
# Returns a Whitepaper::Paper by reading the direct page for a particular paper.
#
# url - the address of the paper's page; it is also stored on the result as
#       :metadata_url.
#
# Title, authors, year and conference are read from the page's
# <meta name="citation_*"> tags; a tag that is absent yields "". The
# abstract is fetched with a second request when the page content contains a
# tab_abstract.cfm link, and is "" otherwise. No pdf or ps links are
# collected by this engine.
def retrieve_details(url)
  @agent = Mechanize.new

  page = @agent.get url

  # Content of the first <meta> tag with the given name, or "" if none exists.
  get_meta = lambda do |name|
    meta = page.search("//meta[@name=\"#{name}\"]").first
    meta.nil? ? "" : meta.attribute("content").to_s
  end

  title       = get_meta.call("citation_title")
  authors_raw = get_meta.call("citation_authors")
  year        = get_meta.call("citation_date")
  # Only the last four characters of the date are kept as the year.
  year        = year[-4..-1] unless year.empty?
  conference  = get_meta.call("citation_conference")

  # Authors arrive as "Last, First; Last, First". Entries without a comma
  # (or with nothing on one side of it) are kept unchanged; blank entries
  # are dropped.
  authors = authors_raw.to_s.split(';').map(&:strip).reject(&:empty?).map do |entry|
    last, first = entry.split(',', 2).map(&:strip)
    if first.nil? || first.empty? || last.empty?
      entry
    else
      "#{first} #{last}"
    end
  end

  links = []
  ps_links = []

  # get abstract; pages without an abstract link simply get an empty one
  abstract = ""
  abstract_match = page.content.match(/tab_abstract\.cfm\?.*cftoken\=\d+/)
  abstract = @agent.get(abstract_match[0]).root.text.to_s.strip if abstract_match

  Paper.new title, authors, {:description  => abstract,
                             :keywords     => [],
                             :metadata_url => url,
                             :year         => year,
                             :conference   => conference,
                             :pdf_urls     => links,
                             :ps_urls      => ps_links}
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -42,7 +42,7 @@ module Whitepaper
|
|
42
42
|
if meta.nil? or meta.first.nil?
|
43
43
|
return ""
|
44
44
|
end
|
45
|
-
meta.first.attribute
|
45
|
+
meta.first.attribute("content").to_s
|
46
46
|
}
|
47
47
|
|
48
48
|
description = get_meta.call("description")
|
@@ -60,32 +60,33 @@ module Whitepaper
|
|
60
60
|
|
61
61
|
link_url = page.search '//ul[@id="clinks"]/li/a'
|
62
62
|
link_url.each do |l|
|
63
|
-
|
64
|
-
if
|
65
|
-
links <<
|
63
|
+
purl = "#{DOMAIN}#{l.attribute("href").to_s}"
|
64
|
+
if purl.end_with? "pdf"
|
65
|
+
links << purl
|
66
66
|
end
|
67
|
-
if
|
68
|
-
ps_links <<
|
67
|
+
if purl.end_with? "ps"
|
68
|
+
ps_links << purl
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
72
72
|
link_url = page.search '//ul[@id="dlinks"]/li/a'
|
73
73
|
link_url.each do |l|
|
74
|
-
|
75
|
-
if
|
76
|
-
links <<
|
74
|
+
purl = l.attribute("href").to_s
|
75
|
+
if purl.end_with? "pdf"
|
76
|
+
links << purl
|
77
77
|
end
|
78
|
-
if
|
79
|
-
ps_links <<
|
78
|
+
if purl.end_with? "ps"
|
79
|
+
ps_links << purl
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
83
|
-
Paper.new title, authors, {:description
|
84
|
-
:keywords
|
85
|
-
:year
|
86
|
-
:conference
|
87
|
-
:
|
88
|
-
:
|
83
|
+
Paper.new title, authors, {:description => description,
|
84
|
+
:keywords => keywords,
|
85
|
+
:year => year,
|
86
|
+
:conference => conference,
|
87
|
+
:metadata_url => url,
|
88
|
+
:pdf_urls => links,
|
89
|
+
:ps_urls => ps_links}
|
89
90
|
end
|
90
91
|
end
|
91
92
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Whitepaper
|
2
|
+
module Engine
|
3
|
+
# This engine uses the IEEEXplore database to query metadata about a paper.
|
4
|
+
module IEEEXplore
|
5
|
+
# The domain to use for IEEEXplore.
DOMAIN = "http://ieeexplore.ieee.org"
|
6
|
+
|
7
|
+
# The url template, relative to DOMAIN, used to search by title. The
# "{title}" placeholder is filled in by find_by_title_url.
SEARCH_BY_TITLE_URL = "search/searchresult.jsp?reload=true&newsearch=true&queryText={title}&x=60&y=7"
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# Returns a url that will query for the given title keywords.
#
# title - the title keywords as a String, separated by whitespace.
#
# The "{title}" placeholder in SEARCH_BY_TITLE_URL is replaced with the
# form-encoded title: spaces become "+", and reserved characters such as
# "&", "#" and "=" are percent-escaped so they cannot alter the query string.
def find_by_title_url(title)
  # NOTE(review): URI is expected to be loaded already, since Mechanize
  # (used by this engine) depends on it -- confirm.
  encoded_title = URI.encode_www_form_component(title)

  # The block form of gsub inserts the replacement literally.
  "#{DOMAIN}/#{SEARCH_BY_TITLE_URL.gsub(/\{title\}/) { encoded_title }}"
end
|
14
|
+
|
15
|
+
# Returns a Whitepaper::Paper by searching for the paper with the given
# title keywords.
#
# The search results page is fetched and the first heading link inside a
# "detail" div is followed to obtain the paper details.
def find_by_title(title)
  @agent = Mechanize.new
  results_page = @agent.get(find_by_title_url(title))

  # Only the first hit is considered.
  first_hit = results_page.search('//div[@class="detail"]/h3/a').first

  retrieve_details "#{DOMAIN}#{first_hit.attribute("href")}"
end
|
27
|
+
|
28
|
+
# Returns a Whitepaper::Paper by reading the direct page for a particular paper.
#
# url - the address of the paper's page; it is also stored on the result as
#       :metadata_url.
#
# Title, year, conference, keywords and description are read from the
# page's <meta property="..."> tags; a tag that is absent yields "". One
# author is taken from each citation_author tag. No pdf or ps links are
# collected by this engine.
def retrieve_details(url)
  @agent = Mechanize.new

  page = @agent.get url

  # Content of the first <meta> tag with the given property, or "" if none exists.
  get_meta = lambda do |name|
    meta = page.search("//meta[@property=\"#{name}\"]").first
    meta.nil? ? "" : meta.attribute("content").to_s
  end

  keywords_raw = get_meta.call("keywords")
  title        = get_meta.call("citation_title")
  # NOTE(review): "description" mirrors the CiteSeerX engine; which tag holds
  # the abstract on IEEEXplore pages is unconfirmed. A missing tag gives "".
  description  = get_meta.call("description")
  year         = get_meta.call("citation_date")
  # Only the last four characters of the date are kept as the year.
  year         = year[-4..-1] unless year.empty?
  conference   = get_meta.call("citation_conference")

  authors = page.search("//meta[@property=\"citation_author\"]").map do |node|
    node.attribute("content").to_s.strip
  end

  keywords = keywords_raw.to_s.split(';').map(&:strip)

  links = []
  ps_links = []

  Paper.new title, authors, {:description  => description,
                             :keywords     => keywords,
                             :year         => year,
                             :conference   => conference,
                             :metadata_url => url,
                             :pdf_urls     => links,
                             :ps_urls      => ps_links}
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
data/lib/whitepaper/paper.rb
CHANGED
@@ -19,6 +19,10 @@ module Whitepaper
|
|
19
19
|
# The conference, if any, the paper appeared. Defaults to "".
|
20
20
|
attr_reader :conference
|
21
21
|
|
22
|
+
# The link to the resource with the most metadata to use as attribution.
|
23
|
+
# Defaults to "".
|
24
|
+
attr_reader :metadata_url
|
25
|
+
|
22
26
|
# A list of urls to pdf copies of the paper. Defaults to [].
|
23
27
|
attr_reader :pdf_urls
|
24
28
|
|
@@ -34,6 +38,7 @@ module Whitepaper
|
|
34
38
|
@keywords = options[:keywords] || []
|
35
39
|
@year = options[:year] || ""
|
36
40
|
@conference = options[:conference] || ""
|
41
|
+
@metadata_url = options[:metadata_url] || ""
|
37
42
|
|
38
43
|
@pdf_urls = options[:pdf_urls] || []
|
39
44
|
@ps_urls = options[:ps_urls] || []
|
@@ -70,6 +75,32 @@ module Whitepaper
|
|
70
75
|
true
|
71
76
|
end
|
72
77
|
|
78
|
+
# Gives a score of relevancy to the title keywords given. Higher scores
# mean that the keywords are more reflective of the title.
#
# keywords - a String of whitespace-separated search words.
#
# The score starts at 1.0. It is multiplied by 10 for every keyword that
# appears in the title and halved for every title word that is not among
# the keywords. Words are compared case-insensitively and with surrounding
# punctuation removed, so the title word "Dynamo:" matches the keyword
# "dynamo". Returns a Float.
def score_by_title(keywords)
  # Lower-cases each word and trims non-alphanumeric characters from both
  # ends; words that consist only of punctuation are dropped.
  normalize = lambda do |text|
    text.to_s.split(" ").map { |word|
      word.downcase.gsub(/\A[^[:alnum:]]+|[^[:alnum:]]+\z/, "")
    }.reject(&:empty?)
  end

  query_words = normalize.call(keywords)
  title_words = normalize.call(title)

  score = 1.0

  # found words are worth x10
  query_words.each { |word| score *= 10.0 if title_words.include?(word) }

  # not found words are worth /2
  title_words.each { |word| score /= 2.0 unless query_words.include?(word) }

  score
end
|
103
|
+
|
73
104
|
# Output a simple description of the paper metadata.
|
74
105
|
def to_s
|
75
106
|
"Title: #{@title}\n" +
|
@@ -78,6 +109,7 @@ module Whitepaper
|
|
78
109
|
"Keywords: #{@keywords}\n" +
|
79
110
|
"Year: #{@year}\n" +
|
80
111
|
"Conference: #{@conference}\n" +
|
112
|
+
"More info: #{@metadata_url}\n" +
|
81
113
|
|
82
114
|
"Pdf Available: #{@pdf_urls}\n" +
|
83
115
|
"Ps Available: #{@ps_urls}"
|
data/lib/whitepaper/version.rb
CHANGED
data/lib/whitepaper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "whitepaper/version"
|
2
2
|
|
3
|
+
require 'whitepaper/engine/acm'
|
3
4
|
require 'whitepaper/engine/citeseerx'
|
4
5
|
require 'whitepaper/engine/google'
|
5
6
|
|
@@ -12,7 +13,11 @@ module Whitepaper
|
|
12
13
|
class << self
|
13
14
|
# Find and return a Whitepaper::Paper by searching for a partial match with the given title.
|
14
15
|
def find_by_title(title)
|
15
|
-
|
16
|
+
paper_csx = Engine::CiteSeerX.find_by_title(title)
|
17
|
+
paper_acm = Engine::ACM.find_by_title(title)
|
18
|
+
paper_i3e = Engine::ACM.find_by_title(title)
|
19
|
+
|
20
|
+
paper = [paper_csx, paper_i3e, paper_acm].sort{|a,b| b.score_by_title(title) <=> a.score_by_title(title)}.first
|
16
21
|
|
17
22
|
if paper.pdf_urls.empty?
|
18
23
|
g = Engine::Google.find_by_title(title)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whitepaper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -59,8 +59,10 @@ files:
|
|
59
59
|
- bin/whitepaper
|
60
60
|
- lib/whitepaper.rb
|
61
61
|
- lib/whitepaper/cli.rb
|
62
|
+
- lib/whitepaper/engine/acm.rb
|
62
63
|
- lib/whitepaper/engine/citeseerx.rb
|
63
64
|
- lib/whitepaper/engine/google.rb
|
65
|
+
- lib/whitepaper/engine/ieeexplore.rb
|
64
66
|
- lib/whitepaper/paper.rb
|
65
67
|
- lib/whitepaper/version.rb
|
66
68
|
- whitepaper.gemspec
|