whitepaper 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/whitepaper/cli.rb +4 -5
- data/lib/whitepaper/engine/citeseerx.rb +16 -9
- data/lib/whitepaper/engine/google.rb +2 -0
- data/lib/whitepaper/paper.rb +21 -0
- data/lib/whitepaper/version.rb +2 -1
- data/lib/whitepaper.rb +11 -0
- data/whitepaper.gemspec +1 -1
- metadata +3 -4
- data/lib/whitepaper/finder.rb +0 -4
data/lib/whitepaper/cli.rb
CHANGED
@@ -3,11 +3,14 @@ require 'optparse'
|
|
3
3
|
require 'whitepaper'
|
4
4
|
|
5
5
|
module Whitepaper
|
6
|
+
# The command-line interface to Whitepaper.
|
6
7
|
class CLI
|
8
|
+
# Usage banner
|
7
9
|
BANNER = <<-USAGE
|
8
10
|
USAGE
|
9
11
|
|
10
12
|
class << self
|
13
|
+
# Parse and respond to the command line options.
|
11
14
|
def parse_options
|
12
15
|
options = {}
|
13
16
|
@opts = OptionParser.new do |opts|
|
@@ -77,6 +80,7 @@ module Whitepaper
|
|
77
80
|
end
|
78
81
|
end
|
79
82
|
|
83
|
+
# Executes the command-line version of Whitepaper.
|
80
84
|
def CLI.run
|
81
85
|
begin
|
82
86
|
parse_options
|
@@ -85,11 +89,6 @@ module Whitepaper
|
|
85
89
|
exit -1
|
86
90
|
end
|
87
91
|
|
88
|
-
def fail
|
89
|
-
puts @opts
|
90
|
-
exit -1
|
91
|
-
end
|
92
|
-
|
93
92
|
# Default
|
94
93
|
puts BANNER
|
95
94
|
exit 0
|
@@ -4,15 +4,21 @@ require 'whitepaper/paper'
|
|
4
4
|
|
5
5
|
module Whitepaper
|
6
6
|
module Engine
|
7
|
+
# This engine uses the CiteSeerX database to query metadata about a paper.
|
7
8
|
module CiteSeerX
|
9
|
+
# The domain to use for CiteSeerX.
|
8
10
|
DOMAIN = "http://citeseerx.ist.psu.edu"
|
11
|
+
|
12
|
+
# The url to use to search by title.
|
9
13
|
SEARCH_BY_TITLE_URL = "search?q=title%3A{title}&t=doc&sort=cite"
|
10
14
|
|
11
15
|
class << self
|
16
|
+
# Returns a url that will query for the given title keywords.
|
12
17
|
def find_by_title_url(title)
|
13
18
|
"#{DOMAIN}/#{SEARCH_BY_TITLE_URL.gsub(/\{title\}/, title)}"
|
14
19
|
end
|
15
20
|
|
21
|
+
# Returns a Whitepaper::Paper by searching for the paper with the given title keywords.
|
16
22
|
def find_by_title(title)
|
17
23
|
@agent = Mechanize.new
|
18
24
|
page = @agent.get "#{find_by_title_url(title)}"
|
@@ -25,25 +31,26 @@ module Whitepaper
|
|
25
31
|
retrieve_details paper_link
|
26
32
|
end
|
27
33
|
|
34
|
+
# Returns a Whitepaper::Paper by reading the direct page for a particular paper.
|
28
35
|
def retrieve_details(url)
|
29
36
|
@agent = Mechanize.new
|
30
37
|
|
31
38
|
page = @agent.get url
|
32
39
|
|
33
|
-
|
40
|
+
get_meta = lambda {|name|
|
34
41
|
meta = page.search "//meta[@name=\"#{name}\"]"
|
35
42
|
if meta.nil? or meta.first.nil?
|
36
43
|
return ""
|
37
44
|
end
|
38
45
|
meta.first.attribute "content"
|
39
|
-
|
40
|
-
|
41
|
-
description = get_meta("description"
|
42
|
-
keywords_raw = get_meta("keywords"
|
43
|
-
title = get_meta("citation_title"
|
44
|
-
authors_raw = get_meta("citation_authors"
|
45
|
-
year = get_meta("citation_year"
|
46
|
-
conference = get_meta("citation_conference"
|
46
|
+
}
|
47
|
+
|
48
|
+
description = get_meta.call("description")
|
49
|
+
keywords_raw = get_meta.call("keywords")
|
50
|
+
title = get_meta.call("citation_title")
|
51
|
+
authors_raw = get_meta.call("citation_authors")
|
52
|
+
year = get_meta.call("citation_year")
|
53
|
+
conference = get_meta.call("citation_conference")
|
47
54
|
|
48
55
|
authors = authors_raw.to_s.split(',').map(&:strip)
|
49
56
|
keywords = keywords_raw.to_s.split(',').map(&:strip)
|
@@ -4,8 +4,10 @@ require 'whitepaper/paper'
|
|
4
4
|
|
5
5
|
module Whitepaper
|
6
6
|
module Engine
|
7
|
+
# This engine simply uses a google filetype:pdf search to find paper information.
|
7
8
|
module Google
|
8
9
|
class << self
|
10
|
+
# Finds a Whitepaper::Paper by looking up a paper with the given title keywords.
|
9
11
|
def find_by_title(title)
|
10
12
|
@agent = Mechanize.new
|
11
13
|
|
data/lib/whitepaper/paper.rb
CHANGED
@@ -1,15 +1,32 @@
|
|
1
1
|
module Whitepaper
|
2
|
+
# The representation of a paper, including title, author, and pdf urls.
|
2
3
|
class Paper
|
4
|
+
# The title of the paper.
|
3
5
|
attr_reader :title
|
6
|
+
|
7
|
+
# The list of authors of the paper.
|
4
8
|
attr_reader :authors
|
9
|
+
|
10
|
+
# A summary of the paper, typically an abstract. Defaults to "".
|
5
11
|
attr_reader :description
|
12
|
+
|
13
|
+
# A list of keywords associated with the paper. Defaults to [].
|
6
14
|
attr_reader :keywords
|
15
|
+
|
16
|
+
# The year of publication. Defaults to "".
|
7
17
|
attr_reader :year
|
18
|
+
|
19
|
+
# The conference, if any, at which the paper appeared. Defaults to "".
|
8
20
|
attr_reader :conference
|
9
21
|
|
22
|
+
# A list of urls to pdf copies of the paper. Defaults to [].
|
10
23
|
attr_reader :pdf_urls
|
24
|
+
|
25
|
+
# A list of urls to ps copies of the paper. Defaults to [].
|
11
26
|
attr_reader :ps_urls
|
12
27
|
|
28
|
+
# Construct an object representing paper metadata with the given fields.
|
29
|
+
# Title and authors are required, all other fields can be omitted.
|
13
30
|
def initialize(title, authors, options = {})
|
14
31
|
@title = title
|
15
32
|
@authors = authors
|
@@ -22,6 +39,9 @@ module Whitepaper
|
|
22
39
|
@ps_urls = options[:ps_urls] || []
|
23
40
|
end
|
24
41
|
|
42
|
+
# Downloads the paper by using the pdf urls. The created file will be named
|
43
|
+
# after the title if no filename is given. The file will overwrite any existing
|
44
|
+
# file with the same name in the current directory.
|
25
45
|
def download(filename = nil)
|
26
46
|
if filename.nil?
|
27
47
|
filename = title.to_s
|
@@ -50,6 +70,7 @@ module Whitepaper
|
|
50
70
|
true
|
51
71
|
end
|
52
72
|
|
73
|
+
# Output a simple description of the paper metadata.
|
53
74
|
def to_s
|
54
75
|
"Title: #{@title}\n" +
|
55
76
|
"Authors: #{@authors}\n" +
|
data/lib/whitepaper/version.rb
CHANGED
data/lib/whitepaper.rb
CHANGED
@@ -3,8 +3,14 @@ require "whitepaper/version"
|
|
3
3
|
require 'whitepaper/engine/citeseerx'
|
4
4
|
require 'whitepaper/engine/google'
|
5
5
|
|
6
|
+
# The namespace for the available metadata gathering engines.
|
7
|
+
module Whitepaper::Engine
|
8
|
+
end
|
9
|
+
|
10
|
+
# The main module encapsulating Whitepaper resources.
|
6
11
|
module Whitepaper
|
7
12
|
class << self
|
13
|
+
# Find and return a Whitepaper::Paper by searching for a partial match with the given title.
|
8
14
|
def find_by_title(title)
|
9
15
|
paper = Engine::CiteSeerX.find_by_title(title)
|
10
16
|
|
@@ -24,6 +30,7 @@ module Whitepaper
|
|
24
30
|
paper
|
25
31
|
end
|
26
32
|
|
33
|
+
# Find and return a list of authors by searching for a partial match with the given title.
|
27
34
|
def find_authors_by_title(title)
|
28
35
|
paper = find_by_title(title)
|
29
36
|
|
@@ -32,6 +39,7 @@ module Whitepaper
|
|
32
39
|
end
|
33
40
|
end
|
34
41
|
|
42
|
+
# Find and return the proper title by searching for a partial match with the given title.
|
35
43
|
def find_title_by_title(title)
|
36
44
|
paper = find_by_title(title)
|
37
45
|
|
@@ -40,6 +48,7 @@ module Whitepaper
|
|
40
48
|
end
|
41
49
|
end
|
42
50
|
|
51
|
+
# Find and return a list of pdf urls by searching for a partial match with the given title.
|
43
52
|
def find_pdfs_by_title(title)
|
44
53
|
paper = find_by_title(title)
|
45
54
|
|
@@ -48,6 +57,8 @@ module Whitepaper
|
|
48
57
|
end
|
49
58
|
end
|
50
59
|
|
60
|
+
# Downloads the first available pdf by searching for a partial match with the given title.
|
61
|
+
# The name of the file will be the title of the paper.
|
51
62
|
def download_pdf_by_title(title)
|
52
63
|
paper = find_by_title(title)
|
53
64
|
paper.download
|
data/whitepaper.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
|
|
10
10
|
gem.email = ["wilkie05@gmail.com"]
|
11
11
|
gem.description = %q{Finds metadata on scholarly works and is able to download pdfs of whitepapers.}
|
12
12
|
gem.summary = %q{Finds whitepaper metadata and pdf download links with a basic keyword query using web-based databases such as Google and CiteSeerX.}
|
13
|
-
gem.homepage = ""
|
13
|
+
gem.homepage = "https://github.com/wilkie/whitepaper"
|
14
14
|
|
15
15
|
gem.files = `git ls-files`.split($/)
|
16
16
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whitepaper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -61,11 +61,10 @@ files:
|
|
61
61
|
- lib/whitepaper/cli.rb
|
62
62
|
- lib/whitepaper/engine/citeseerx.rb
|
63
63
|
- lib/whitepaper/engine/google.rb
|
64
|
-
- lib/whitepaper/finder.rb
|
65
64
|
- lib/whitepaper/paper.rb
|
66
65
|
- lib/whitepaper/version.rb
|
67
66
|
- whitepaper.gemspec
|
68
|
-
homepage:
|
67
|
+
homepage: https://github.com/wilkie/whitepaper
|
69
68
|
licenses: []
|
70
69
|
post_install_message:
|
71
70
|
rdoc_options: []
|
data/lib/whitepaper/finder.rb
DELETED