whitepaper 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +10 -7
- data/lib/whitepaper.rb +20 -13
- data/lib/whitepaper/cli.rb +3 -3
- data/lib/whitepaper/engine/acm.rb +1 -0
- data/lib/whitepaper/engine/citeseerx.rb +5 -0
- data/lib/whitepaper/engine/google.rb +54 -4
- data/lib/whitepaper/engine/ieeexplore.rb +3 -1
- data/lib/whitepaper/version.rb +1 -1
- metadata +3 -3
data/README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# Whitepaper
|
2
2
|
|
3
|
-
This gem will perform a whitepaper lookup on major scholarly databases.
|
3
|
+
This gem will perform a whitepaper lookup on major scholarly databases. Its purpose is to easily find
|
4
4
|
related papers and organize your paper collection. With this application, you can easily download pdfs
|
5
5
|
or use it as a library to automatically assign metadata.
|
6
6
|
|
7
|
-
Currently, CiteSeerX
|
8
|
-
|
7
|
+
Currently, CiteSeerX, ACM and IEEE are the only databases it uses, along with a
|
8
|
+
Google pdf/ps search to find other pdf or ps links to download.
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
@@ -21,7 +21,10 @@ Or install it yourself as:
|
|
21
21
|
|
22
22
|
$ gem install whitepaper
|
23
23
|
|
24
|
-
## Usage
|
24
|
+
## Command-line Usage
|
25
|
+
|
26
|
+
The command-line tool makes it easy to find metadata and download pdf copies of
|
27
|
+
papers found via keyword search. It is really only designed for personal use.
|
25
28
|
|
26
29
|
Display usage:
|
27
30
|
|
@@ -50,7 +53,7 @@ Printing the article's pdf url:
|
|
50
53
|
Finally, you can simply have the app download an article and place it in the
|
51
54
|
current directory. It will name the file as closely to the title as it can.
|
52
55
|
|
53
|
-
Download a
|
56
|
+
Download a PDF by any means necessary by title keyword search:
|
54
57
|
|
55
58
|
whitepaper -d -t "The Design and Implementation of a Log-Structured File System"
|
56
59
|
|
@@ -66,7 +69,7 @@ And require it if necessary: (Your project may auto require libraries in your Ge
|
|
66
69
|
|
67
70
|
Invoke with this simple command to look up a paper with the given terms in the title:
|
68
71
|
|
69
|
-
paper = Whitepaper.find_by_title("
|
72
|
+
paper = Whitepaper.find_by_title("hierarchical file systems are dead")
|
70
73
|
|
71
74
|
This will give you back a Whitepaper::Paper object! To get a pdf url, just go:
|
72
75
|
|
@@ -104,4 +107,4 @@ license. If this is unacceptable for you, please defer to the copyright holder.
|
|
104
107
|
### TODO
|
105
108
|
|
106
109
|
1. Add new output options (JSON, YAML, etc) for better metadata usage by other programs.
|
107
|
-
2. Add new engines (Google Scholar,
|
110
|
+
2. Add new engines (Google Scholar, etc)
|
data/lib/whitepaper.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require "whitepaper/version"
|
2
2
|
|
3
3
|
require 'whitepaper/engine/acm'
|
4
|
+
require 'whitepaper/engine/ieeexplore'
|
4
5
|
require 'whitepaper/engine/citeseerx'
|
5
6
|
require 'whitepaper/engine/google'
|
6
7
|
|
@@ -15,22 +16,28 @@ module Whitepaper
|
|
15
16
|
# Looks up a paper by title keywords across every metadata engine and
# returns the best match, enriched with download links found via Google.
#
# Each of the CiteSeerX, ACM and IEEEXplore engines is queried; engines
# that find nothing return nil and are ignored. The remaining candidates
# are ranked with Paper#score_by_title and the highest-scoring one wins.
#
# title - String of keywords expected to appear in the paper's title.
#
# Returns a Paper built from the winning candidate's metadata plus the
# Google pdf/ps links, the Google result alone when no metadata engine
# matched, or the winning candidate untouched when Google has nothing.
def find_by_title(title)
  candidates = [
    Engine::CiteSeerX.find_by_title(title),
    Engine::ACM.find_by_title(title),
    Engine::IEEEXplore.find_by_title(title)
  ].compact

  # max_by scores each candidate exactly once (sort re-scored on every
  # comparison).
  best = candidates.max_by { |candidate| candidate.score_by_title(title) }

  # Gather pdf and ps links across the open internet
  g = Engine::Google.find_by_title(title)

  return g if best.nil?
  return best if g.nil?

  # Use + rather than concat so the engine's own Paper keeps its original
  # url lists instead of being mutated as a side effect of the merge.
  Paper.new(best.title,
            best.authors,
            {:description => best.description,
             :keywords    => best.keywords,
             :year        => best.year,
             :conference  => best.conference,
             :pdf_urls    => best.pdf_urls + g.pdf_urls,
             :ps_urls     => best.ps_urls + g.ps_urls})
end
|
data/lib/whitepaper/cli.rb
CHANGED
@@ -27,8 +27,8 @@ module Whitepaper
|
|
27
27
|
opts.on('-t', '--by-title KEYWORDS', 'Display the data for the paper with the given KEYWORDS in title') do |title|
|
28
28
|
options[:by_title] = title
|
29
29
|
end
|
30
|
-
|
31
|
-
opts.on('-d', '--download', 'Downloads a pdf of the paper of the paper found') do
|
30
|
+
|
31
|
+
opts.on('-d', '--download', 'Downloads a pdf of the paper of the paper found') do
|
32
32
|
options[:download] = true
|
33
33
|
end
|
34
34
|
|
@@ -76,7 +76,7 @@ module Whitepaper
|
|
76
76
|
paper.download
|
77
77
|
end
|
78
78
|
else
|
79
|
-
puts opts
|
79
|
+
puts @opts
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
@@ -10,6 +10,7 @@ module Whitepaper
|
|
10
10
|
# The url to use to search by title.
|
11
11
|
SEARCH_BY_TITLE_URL2 = "results.cfm?within={title_query}&adv=1&DL=ACM&termzone=Title&allofem={title}"
|
12
12
|
|
13
|
+
# The alternate url to use to search by title.
|
13
14
|
SEARCH_BY_TITLE_URL = "results.cfm?query={title}&querydisp={title}&srt=score%20dsc&short=0&coll=DL&dl=GUIDE&source_disp=&source_query=&since_month=&since_year=&before_month=&before_year=&termshow=matchall&range_query="
|
14
15
|
|
15
16
|
class << self
|
@@ -26,6 +26,11 @@ module Whitepaper
|
|
26
26
|
# get the first link
|
27
27
|
paper = page.search '//div[@id="result_list"]/div[@class="result"]/h3/a'
|
28
28
|
|
29
|
+
if paper.empty?
|
30
|
+
# no results
|
31
|
+
return nil
|
32
|
+
end
|
33
|
+
|
29
34
|
paper_link = "#{DOMAIN}#{paper.first.attribute("href")}"
|
30
35
|
|
31
36
|
retrieve_details paper_link
|
@@ -7,11 +7,11 @@ module Whitepaper
|
|
7
7
|
# This engine simply uses a google filetype:pdf search to find paper information.
|
8
8
|
module Google
|
9
9
|
class << self
|
10
|
-
#
|
11
|
-
def
|
10
|
+
# Return the url and title of the first result as a hash with keys :url and :title.
|
11
|
+
def find(url)
|
12
12
|
@agent = Mechanize.new
|
13
13
|
|
14
|
-
page = @agent.get
|
14
|
+
page = @agent.get url
|
15
15
|
|
16
16
|
results = page.search '//h3[@class="r"]'
|
17
17
|
|
@@ -35,11 +35,61 @@ module Whitepaper
|
|
35
35
|
end
|
36
36
|
|
37
37
|
if urls.length > 0
|
38
|
-
|
38
|
+
urls.first
|
39
39
|
else
|
40
40
|
nil
|
41
41
|
end
|
42
42
|
end
|
43
|
+
|
44
|
+
# Finds a Whitepaper::Paper by looking up a paper with the given title keywords.
#
# Runs one "filetype:pdf" and one "filetype:ps" Google search through +find+
# and keeps the link whose result title scores best against +title+ (both
# links are kept on a tie). The returned paper is titled after the hit that
# was kept, preferring the pdf hit when both are kept.
#
# title - String of keywords to search for.
#
# Returns a Paper with no authors and the collected :pdf_urls / :ps_urls,
# or nil when neither search produced a result.
def find_by_title(title)
  # URI.encode was removed in Ruby 3.0; the form-component encoder also
  # escapes characters such as '&' that would otherwise split the query.
  query = URI.encode_www_form_component(title)

  pdf = find("https://www.google.com/search?q=#{query}+filetype%3Apdf")
  ps = find("https://www.google.com/search?q=#{query}+filetype%3Aps")

  # find returns nil when a search has no results; with no hit at all there
  # is nothing to build a paper from.
  return nil unless pdf || ps

  pdf_score = score(pdf[:title], title) if pdf
  ps_score = score(ps[:title], title) if ps

  # A hit is kept when it is the only one, or scores at least as well as
  # the other format's hit.
  keep_pdf = pdf && (ps.nil? || pdf_score >= ps_score)
  keep_ps = ps && (pdf.nil? || ps_score >= pdf_score)

  pdf_urls = keep_pdf ? [pdf[:url]] : []
  ps_urls = keep_ps ? [ps[:url]] : []

  best = keep_pdf ? pdf : ps

  Paper.new(best[:title], [], {:pdf_urls => pdf_urls,
                               :ps_urls  => ps_urls})
end
|
66
|
+
|
67
|
+
# Get an early score rating for how well a result title matches the
# search keywords.
#
# Both strings are split on whitespace and compared case-insensitively,
# word for word. Starting from 1.0, every keyword that appears in the
# title multiplies the rating by 10 and every title word that is not a
# keyword halves it. A nil title or nil keywords is treated as empty.
#
# title    - String title of a search result.
# keywords - String of space-separated search keywords.
#
# Returns the rating as a Float (higher is a better match).
#--
# TODO: move into own class for Whitepaper::Paper
def score(title, keywords)
  wanted = keywords.to_s.split(" ").map(&:downcase)
  present = title.to_s.split(" ").map(&:downcase)

  matched = wanted.count { |word| present.include?(word) }
  unrelated = present.count { |word| !wanted.include?(word) }

  rating = 1.0

  # found words are worth x10
  matched.times { rating *= 10.0 }

  # not found words are worth /2
  unrelated.times { rating /= 2.0 }

  rating
end
|
43
93
|
end
|
44
94
|
end
|
45
95
|
end
|
@@ -2,8 +2,10 @@ module Whitepaper
|
|
2
2
|
module Engine
|
3
3
|
# This engine uses the IEEEXplore database to query metadata about a paper.
|
4
4
|
module IEEEXplore
|
5
|
+
# The domain for IEEEXplore.
|
5
6
|
DOMAIN = "http://ieeexplore.ieee.org"
|
6
7
|
|
8
|
+
# The url to use to search by title keywords.
|
7
9
|
SEARCH_BY_TITLE_URL = "search/searchresult.jsp?reload=true&newsearch=true&queryText={title}&x=60&y=7"
|
8
10
|
|
9
11
|
class << self
|
@@ -56,7 +58,7 @@ module Whitepaper
|
|
56
58
|
links = []
|
57
59
|
ps_links = []
|
58
60
|
|
59
|
-
Paper.new title, authors, {:description =>
|
61
|
+
Paper.new title, authors, {:description => "",
|
60
62
|
:keywords => keywords,
|
61
63
|
:year => year,
|
62
64
|
:conference => conference,
|
data/lib/whitepaper/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whitepaper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-04-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -86,7 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
86
86
|
version: '0'
|
87
87
|
requirements: []
|
88
88
|
rubyforge_project:
|
89
|
-
rubygems_version: 1.8.
|
89
|
+
rubygems_version: 1.8.25
|
90
90
|
signing_key:
|
91
91
|
specification_version: 3
|
92
92
|
summary: Finds whitepaper metadata and pdf download links with a basic keyword query
|