whitepaper 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +10 -7
- data/lib/whitepaper.rb +20 -13
- data/lib/whitepaper/cli.rb +3 -3
- data/lib/whitepaper/engine/acm.rb +1 -0
- data/lib/whitepaper/engine/citeseerx.rb +5 -0
- data/lib/whitepaper/engine/google.rb +54 -4
- data/lib/whitepaper/engine/ieeexplore.rb +3 -1
- data/lib/whitepaper/version.rb +1 -1
- metadata +3 -3
data/README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# Whitepaper
|
2
2
|
|
3
|
-
This gem will perform a whitepaper lookup on major scholarly databases.
|
3
|
+
This gem will perform a whitepaper lookup on major scholarly databases. Its purpose is to easily find
|
4
4
|
related papers and organize your paper collection. With this application, you can easily download pdfs
|
5
5
|
or use it as a library to automatically assign metadata.
|
6
6
|
|
7
|
-
Currently, CiteSeerX
|
8
|
-
|
7
|
+
Currently, CiteSeerX, ACM and IEEE are the only databases it uses, along with a
|
8
|
+
google pdf/ps search to find other pdf or ps links to download.
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
@@ -21,7 +21,10 @@ Or install it yourself as:
|
|
21
21
|
|
22
22
|
$ gem install whitepaper
|
23
23
|
|
24
|
-
## Usage
|
24
|
+
## Command-line Usage
|
25
|
+
|
26
|
+
The command-line tool makes it easy to find metadata and download pdf copies of
|
27
|
+
papers found via keyword search. It is really only designed for personal use.
|
25
28
|
|
26
29
|
Display usage:
|
27
30
|
|
@@ -50,7 +53,7 @@ Printing the article's pdf url:
|
|
50
53
|
Finally, you can simply have the app download an article and place it in the
|
51
54
|
current directory. It will name the file as closely to the title as it can.
|
52
55
|
|
53
|
-
Download a
|
56
|
+
Download a PDF by any means necessary by title keyword search:
|
54
57
|
|
55
58
|
whitepaper -d -t "The Design and Implementation of a Log-Structured File System"
|
56
59
|
|
@@ -66,7 +69,7 @@ And require it if necessary: (Your project may auto require libraries in your Ge
|
|
66
69
|
|
67
70
|
Invoke with this simple command to look up a paper with the given terms in the title:
|
68
71
|
|
69
|
-
paper = Whitepaper.find_by_title("
|
72
|
+
paper = Whitepaper.find_by_title("hierarchical file systems are dead")
|
70
73
|
|
71
74
|
This will give you back a Whitepaper::Paper object! To get a pdf url, just go:
|
72
75
|
|
@@ -104,4 +107,4 @@ license. If this is unacceptable for you, please defer to the copyright holder.
|
|
104
107
|
### TODO
|
105
108
|
|
106
109
|
1. Add new output options (JSON, YAML, etc) for better metadata usage by other programs.
|
107
|
-
2. Add new engines (Google Scholar,
|
110
|
+
2. Add new engines (Google Scholar, etc)
|
data/lib/whitepaper.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require "whitepaper/version"
|
2
2
|
|
3
3
|
require 'whitepaper/engine/acm'
|
4
|
+
require 'whitepaper/engine/ieeexplore'
|
4
5
|
require 'whitepaper/engine/citeseerx'
|
5
6
|
require 'whitepaper/engine/google'
|
6
7
|
|
@@ -15,22 +16,28 @@ module Whitepaper
|
|
15
16
|
# Finds the best matching Whitepaper::Paper for the given title keywords.
#
# Asks the CiteSeerX, ACM and IEEEXplore engines for a paper, keeps the
# result whose score_by_title for +title+ is highest, and returns a new Paper
# carrying that result's metadata plus the pdf/ps links found by the Google
# engine. When none of the database engines returns a paper, the Google
# result is returned unchanged.
def find_by_title(title)
  # Engines answer nil when they have no match; compact drops those.
  candidates = [
    Engine::CiteSeerX.find_by_title(title),
    Engine::ACM.find_by_title(title),
    Engine::IEEEXplore.find_by_title(title)
  ].compact

  best = candidates.max_by { |candidate| candidate.score_by_title(title) }

  # Gather pdf and ps links across the open internet
  g = Engine::Google.find_by_title(title)

  return g if best.nil?

  # Array#+ builds new arrays, so the engine result's own url lists are not
  # modified as a side effect of merging in the Google links.
  Paper.new(best.title,
            best.authors,
            {:description => best.description,
             :keywords    => best.keywords,
             :year        => best.year,
             :conference  => best.conference,
             :pdf_urls    => best.pdf_urls + g.pdf_urls,
             :ps_urls     => best.ps_urls + g.ps_urls})
end
|
data/lib/whitepaper/cli.rb
CHANGED
@@ -27,8 +27,8 @@ module Whitepaper
|
|
27
27
|
opts.on('-t', '--by-title KEYWORDS', 'Display the data for the paper with the given KEYWORDS in title') do |title|
|
28
28
|
options[:by_title] = title
|
29
29
|
end
|
30
|
-
|
31
|
-
opts.on('-d', '--download', 'Downloads a pdf of the paper of the paper found') do
|
30
|
+
|
31
|
+
opts.on('-d', '--download', 'Downloads a pdf of the paper of the paper found') do
|
32
32
|
options[:download] = true
|
33
33
|
end
|
34
34
|
|
@@ -76,7 +76,7 @@ module Whitepaper
|
|
76
76
|
paper.download
|
77
77
|
end
|
78
78
|
else
|
79
|
-
puts opts
|
79
|
+
puts @opts
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
@@ -10,6 +10,7 @@ module Whitepaper
|
|
10
10
|
# The url to use to search by title.
|
11
11
|
SEARCH_BY_TITLE_URL2 = "results.cfm?within={title_query}&adv=1&DL=ACM&termzone=Title&allofem={title}"
|
12
12
|
|
13
|
+
# The alternate url to use to search by title.
|
13
14
|
SEARCH_BY_TITLE_URL = "results.cfm?query={title}&querydisp={title}&srt=score%20dsc&short=0&coll=DL&dl=GUIDE&source_disp=&source_query=&since_month=&since_year=&before_month=&before_year=&termshow=matchall&range_query="
|
14
15
|
|
15
16
|
class << self
|
@@ -26,6 +26,11 @@ module Whitepaper
|
|
26
26
|
# get the first link
|
27
27
|
paper = page.search '//div[@id="result_list"]/div[@class="result"]/h3/a'
|
28
28
|
|
29
|
+
if paper.empty?
|
30
|
+
# no results
|
31
|
+
return nil
|
32
|
+
end
|
33
|
+
|
29
34
|
paper_link = "#{DOMAIN}#{paper.first.attribute("href")}"
|
30
35
|
|
31
36
|
retrieve_details paper_link
|
@@ -7,11 +7,11 @@ module Whitepaper
|
|
7
7
|
# This engine simply uses a google filetype:pdf search to find paper information.
|
8
8
|
module Google
|
9
9
|
class << self
|
10
|
-
#
|
11
|
-
def
|
10
|
+
# Return the url and title of the first result as a hash with keys :url and :title.
|
11
|
+
def find(url)
|
12
12
|
@agent = Mechanize.new
|
13
13
|
|
14
|
-
page = @agent.get
|
14
|
+
page = @agent.get url
|
15
15
|
|
16
16
|
results = page.search '//h3[@class="r"]'
|
17
17
|
|
@@ -35,11 +35,61 @@ module Whitepaper
|
|
35
35
|
end
|
36
36
|
|
37
37
|
if urls.length > 0
|
38
|
-
|
38
|
+
urls.first
|
39
39
|
else
|
40
40
|
nil
|
41
41
|
end
|
42
42
|
end
|
43
|
+
|
44
|
+
# Finds a Whitepaper::Paper by looking up a paper with the given title keywords.
#
# Runs one Google search restricted to pdf files and one restricted to ps
# files, scores the first hit of each against +title+, and keeps the url of
# the better-scoring hit (both urls when the scores tie). A search without a
# hit scores 0.0, so it never outranks a real hit and contributes no url.
#
# The returned Paper has no authors; its title is the pdf hit's title, else
# the ps hit's title, else nil when neither search found anything.
def find_by_title(title)
  # encode_www_form_component also escapes '&', '+' and '#', which would
  # otherwise truncate or alter the search query.
  query = URI.encode_www_form_component(title)

  pdf = find("https://www.google.com/search?q=#{query}+filetype%3Apdf")
  ps  = find("https://www.google.com/search?q=#{query}+filetype%3Aps")

  # find answers nil when a search has no usable result, so only score hits
  # that exist instead of dereferencing nil.
  pdf_score = pdf ? score(pdf[:title], title) : 0.0
  ps_score  = ps ? score(ps[:title], title) : 0.0

  pdf_urls = []
  ps_urls  = []
  pdf_urls << pdf[:url] if pdf && pdf_score >= ps_score
  ps_urls << ps[:url] if ps && ps_score >= pdf_score

  hit = pdf || ps
  Paper.new(hit && hit[:title], [], {:pdf_urls => pdf_urls,
                                     :ps_urls  => ps_urls})
end
|
66
|
+
|
67
|
+
# Get an early score rating
#
# Rates how well +title+ matches the space-separated +keywords+, ignoring
# case. Starting from 1.0, the rating is multiplied by 10 for every keyword
# that appears as a word in the title and halved for every title word that
# is not one of the keywords. A nil title or nil keywords is rated like an
# empty string instead of raising.
#--
# TODO: move into own class for Whitepaper::Paper
def score(title, keywords)
  # split(" ") already discards surrounding whitespace, so no strip is needed
  wanted  = keywords.to_s.downcase.split(" ")
  present = title.to_s.downcase.split(" ")

  rating = 1.0

  # found words are worth x10
  wanted.each { |word| rating *= 10.0 if present.include?(word) }

  # not found words are worth /2
  present.each { |word| rating /= 2.0 unless wanted.include?(word) }

  rating
end
|
43
93
|
end
|
44
94
|
end
|
45
95
|
end
|
@@ -2,8 +2,10 @@ module Whitepaper
|
|
2
2
|
module Engine
|
3
3
|
# This engine uses the IEEEXplore database to query metadata about a paper.
|
4
4
|
module IEEEXplore
|
5
|
+
# The domain for IEEEXplore.
|
5
6
|
DOMAIN = "http://ieeexplore.ieee.org"
|
6
7
|
|
8
|
+
# The url to use to search by title keywords.
|
7
9
|
SEARCH_BY_TITLE_URL = "search/searchresult.jsp?reload=true&newsearch=true&queryText={title}&x=60&y=7"
|
8
10
|
|
9
11
|
class << self
|
@@ -56,7 +58,7 @@ module Whitepaper
|
|
56
58
|
links = []
|
57
59
|
ps_links = []
|
58
60
|
|
59
|
-
Paper.new title, authors, {:description =>
|
61
|
+
Paper.new title, authors, {:description => "",
|
60
62
|
:keywords => keywords,
|
61
63
|
:year => year,
|
62
64
|
:conference => conference,
|
data/lib/whitepaper/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whitepaper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-04-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -86,7 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
86
86
|
version: '0'
|
87
87
|
requirements: []
|
88
88
|
rubyforge_project:
|
89
|
-
rubygems_version: 1.8.
|
89
|
+
rubygems_version: 1.8.25
|
90
90
|
signing_key:
|
91
91
|
specification_version: 3
|
92
92
|
summary: Finds whitepaper metadata and pdf download links with a basic keyword query
|