whitepaper 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ require 'mechanize'
2
+
3
module Whitepaper
  module Engine
    # This engine uses the ACM database to query metadata about a paper.
    #
    # All methods are module-level (singleton) methods. Each public call
    # replaces the module-level @agent with a fresh Mechanize instance.
    module ACM
      # The domain to use for ACM.
      DOMAIN = "https://dl.acm.org"

      # Alternative search template with {title_query} and {title}
      # placeholders. It is not referenced by any method in this module.
      SEARCH_BY_TITLE_URL2 = "results.cfm?within={title_query}&adv=1&DL=ACM&termzone=Title&allofem={title}"

      # The url (relative to DOMAIN) to use to search by title. Every {title}
      # placeholder is substituted by find_by_title_url.
      SEARCH_BY_TITLE_URL = "results.cfm?query={title}&querydisp={title}&srt=score%20dsc&short=0&coll=DL&dl=GUIDE&source_disp=&source_query=&since_month=&since_year=&before_month=&before_year=&termshow=matchall&range_query="

      # Pattern that locates the abstract tab url inside a paper page's markup.
      ABSTRACT_URL_PATTERN = /tab_abstract\.cfm\?.*cftoken\=\d+/

      class << self
        # Returns a url that will query for the given title keywords.
        #
        # title - String of whitespace-separated keywords. Whitespace is
        #         turned into "+"; no other escaping is applied.
        def find_by_title_url(title)
          title_param = title.gsub(/\s/, "+")
          title_query = "(Title:\"" + title.split(" ").join("\"+or+Title:\"") + "\")"

          # Block-form gsub inserts the replacement literally, so a backslash
          # in the title is not interpreted as a back-reference.
          # {title_query} does not occur in SEARCH_BY_TITLE_URL; the second
          # substitution only matters for templates such as
          # SEARCH_BY_TITLE_URL2.
          path = SEARCH_BY_TITLE_URL
            .gsub(/\{title\}/) { title_param }
            .gsub(/\{title_query\}/) { title_query }

          "#{DOMAIN}/#{path}"
        end

        # Returns a Whitepaper::Paper by searching for the paper with the
        # given title keywords. The first result link of the search page is
        # followed.
        #
        # Raises NoMethodError when the result page contains no
        # a.medium-text link (there is no first hit to read an href from).
        def find_by_title(title)
          @agent = Mechanize.new

          # In case cookies are ever necessary to establish:
          #page = @agent.get("#{DOMAIN}")
          #search_url = page.search('//form[@name="qiksearch"]').first.attribute("action").to_s

          page = @agent.get(find_by_title_url(title))

          # get the first link
          first_hit = page.search('//a[@class="medium-text"]').first

          retrieve_details "#{DOMAIN}/#{first_hit.attribute("href")}"
        end

        # Returns a Whitepaper::Paper by reading the direct page for a
        # particular paper.
        #
        # url - String url of the paper's page; also stored as :metadata_url.
        #
        # Title, authors, date and conference come from the page's
        # citation_* meta tags; a missing tag yields "". The description is
        # the text of the abstract tab, or "" when the page has none.
        def retrieve_details(url)
          @agent = Mechanize.new

          page = @agent.get url

          title = meta_content(page, "citation_title")
          authors_raw = meta_content(page, "citation_authors")
          year = meta_content(page, "citation_date")
          # Keep only the last four characters of the date string.
          year = year[-4..-1] unless year.empty?
          conference = meta_content(page, "citation_conference")

          authors = authors_raw.split(';').map(&:strip).reject(&:empty?).map do |name|
            format_author(name)
          end

          abstract = fetch_abstract(page)

          Paper.new title, authors, {:description  => abstract,
                                     :keywords     => [],
                                     :metadata_url => url,
                                     :year         => year,
                                     :conference   => conference,
                                     :pdf_urls     => [],
                                     :ps_urls      => []}
        end

        private

        # Returns the content attribute of the first <meta name="..."> tag
        # with the given name, or "" when the page has no such tag.
        def meta_content(page, name)
          nodes = page.search "//meta[@name=\"#{name}\"]"
          node = nodes && nodes.first
          node ? node.attribute("content").to_s : ""
        end

        # Turns "Last, First" into "First Last". Names without a comma, or
        # with nothing on one side of it, are returned unchanged.
        def format_author(name)
          last, first = name.split(',', 2).map(&:strip)
          return name if first.nil? || first.empty? || last.empty?
          "#{first} #{last}"
        end

        # Returns the stripped text of the abstract tab referenced in the
        # page markup, or "" when no abstract url is present.
        def fetch_abstract(page)
          match = page.content.match(ABSTRACT_URL_PATTERN)
          return "" unless match

          # The matched url is relative; presumably Mechanize resolves it
          # against the page it just fetched -- TODO confirm.
          @agent.get(match[0]).root.text.to_s.strip
        end
      end
    end
  end
end
@@ -42,7 +42,7 @@ module Whitepaper
42
42
  if meta.nil? or meta.first.nil?
43
43
  return ""
44
44
  end
45
- meta.first.attribute "content"
45
+ meta.first.attribute("content").to_s
46
46
  }
47
47
 
48
48
  description = get_meta.call("description")
@@ -60,32 +60,33 @@ module Whitepaper
60
60
 
61
61
  link_url = page.search '//ul[@id="clinks"]/li/a'
62
62
  link_url.each do |l|
63
- url = "#{DOMAIN}#{l.attribute("href").to_s}"
64
- if url.end_with? "pdf"
65
- links << url
63
+ purl = "#{DOMAIN}#{l.attribute("href").to_s}"
64
+ if purl.end_with? "pdf"
65
+ links << purl
66
66
  end
67
- if url.end_with? "ps"
68
- ps_links << url
67
+ if purl.end_with? "ps"
68
+ ps_links << purl
69
69
  end
70
70
  end
71
71
 
72
72
  link_url = page.search '//ul[@id="dlinks"]/li/a'
73
73
  link_url.each do |l|
74
- url = l.attribute("href").to_s
75
- if url.end_with? "pdf"
76
- links << url
74
+ purl = l.attribute("href").to_s
75
+ if purl.end_with? "pdf"
76
+ links << purl
77
77
  end
78
- if url.end_with? "ps"
79
- ps_links << url
78
+ if purl.end_with? "ps"
79
+ ps_links << purl
80
80
  end
81
81
  end
82
82
 
83
- Paper.new title, authors, {:description => description,
84
- :keywords => keywords,
85
- :year => year,
86
- :conference => conference,
87
- :pdf_urls => links,
88
- :ps_urls => ps_links}
83
+ Paper.new title, authors, {:description => description,
84
+ :keywords => keywords,
85
+ :year => year,
86
+ :conference => conference,
87
+ :metadata_url => url,
88
+ :pdf_urls => links,
89
+ :ps_urls => ps_links}
89
90
  end
90
91
  end
91
92
  end
@@ -0,0 +1,70 @@
1
module Whitepaper
  module Engine
    # This engine uses the IEEEXplore database to query metadata about a paper.
    #
    # NOTE(review): this file references Mechanize without requiring it, so
    # it relies on 'mechanize' having been loaded elsewhere -- confirm.
    module IEEEXplore
      # The domain to use for IEEEXplore.
      DOMAIN = "http://ieeexplore.ieee.org"

      # The url (relative to DOMAIN) to use to search by title. The {title}
      # placeholder is substituted by find_by_title_url.
      SEARCH_BY_TITLE_URL = "search/searchresult.jsp?reload=true&newsearch=true&queryText={title}&x=60&y=7"

      class << self
        # Returns a url that will query for the given title keywords.
        #
        # title - String of whitespace-separated keywords. Whitespace is
        #         turned into "+"; no other escaping is applied.
        def find_by_title_url(title)
          title_param = title.gsub(/\s/, "+")
          # Block-form gsub inserts the replacement literally, so a backslash
          # in the title is not interpreted as a back-reference.
          "#{DOMAIN}/#{SEARCH_BY_TITLE_URL.gsub(/\{title\}/) { title_param }}"
        end

        # Returns a Whitepaper::Paper by searching for the paper with the
        # given title keywords. The first result link of the search page is
        # followed.
        #
        # Raises NoMethodError when the result page contains no matching
        # link (there is no first hit to read an href from).
        def find_by_title(title)
          @agent = Mechanize.new
          page = @agent.get find_by_title_url(title)

          # get the first link
          first_hit = page.search('//div[@class="detail"]/h3/a').first

          retrieve_details "#{DOMAIN}#{first_hit.attribute("href")}"
        end

        # Returns a Whitepaper::Paper by reading the direct page for a
        # particular paper.
        #
        # url - String url of the paper's page; also stored as :metadata_url.
        #
        # All fields come from <meta property="..."> tags; a missing tag
        # yields "" (or an empty list for authors and keywords).
        def retrieve_details(url)
          @agent = Mechanize.new

          page = @agent.get url

          # Previously `description` was handed to Paper.new without ever
          # being assigned, which raised NameError on every call. It is now
          # read from the "description" meta tag.
          description = meta_content(page, "description")
          keywords_raw = meta_content(page, "keywords")
          title = meta_content(page, "citation_title")
          year = meta_content(page, "citation_date")
          # Keep only the last four characters of the date string.
          year = year[-4..-1] unless year.empty?
          conference = meta_content(page, "citation_conference")

          # One citation_author tag per author.
          authors = page.search("//meta[@property=\"citation_author\"]").map do |node|
            node.attribute("content").to_s.strip
          end

          keywords = keywords_raw.split(';').map(&:strip)

          Paper.new title, authors, {:description  => description,
                                     :keywords     => keywords,
                                     :year         => year,
                                     :conference   => conference,
                                     :metadata_url => url,
                                     :pdf_urls     => [],
                                     :ps_urls      => []}
        end

        private

        # Returns the content attribute of the first <meta property="...">
        # tag with the given property, or "" when the page has no such tag.
        def meta_content(page, name)
          nodes = page.search "//meta[@property=\"#{name}\"]"
          node = nodes && nodes.first
          node ? node.attribute("content").to_s : ""
        end
      end
    end
  end
end
@@ -19,6 +19,10 @@ module Whitepaper
19
19
  # The conference, if any, at which the paper appeared. Defaults to "".
20
20
  attr_reader :conference
21
21
 
22
+ # The link to the resource with the most metadata to use as attribution.
23
+ # Defaults to "".
24
+ attr_reader :metadata_url
25
+
22
26
  # A list of urls to pdf copies of the paper. Defaults to [].
23
27
  attr_reader :pdf_urls
24
28
 
@@ -34,6 +38,7 @@ module Whitepaper
34
38
  @keywords = options[:keywords] || []
35
39
  @year = options[:year] || ""
36
40
  @conference = options[:conference] || ""
41
+ @metadata_url = options[:metadata_url] || ""
37
42
 
38
43
  @pdf_urls = options[:pdf_urls] || []
39
44
  @ps_urls = options[:ps_urls] || []
@@ -70,6 +75,32 @@ module Whitepaper
70
75
  true
71
76
  end
72
77
 
78
+ # Gives a score of relevancy to the title keywords given. Higher scores
79
+ # mean that the keywords are more reflective of the title.
80
def score_by_title(keywords)
  # keywords - String of whitespace-separated search terms.
  #
  # Returns a Float. Starting from 1.0, every keyword that appears in the
  # title multiplies the score by 10 and every title word that is not among
  # the keywords halves it. Comparison is case-insensitive and on whole
  # whitespace-separated words. A blank title scores 0.0.
  query_words = keywords.downcase.split(" ")
  title_words = title.to_s.downcase.split(" ")

  # A paper with no title matches nothing. Without this guard it would keep
  # the neutral 1.0 and outrank a real paper whose long title matched only
  # one keyword (e.g. 10 / 2**4 = 0.625).
  return 0.0 if title_words.empty?

  hits = query_words.count { |word| title_words.include?(word) }
  misses = title_words.count { |word| !query_words.include?(word) }

  score = 1.0
  # found words are worth x10
  hits.times { score *= 10.0 }
  # not found words are worth /2
  misses.times { score /= 2.0 }
  score
end
103
+
73
104
  # Output a simple description of the paper metadata.
74
105
  def to_s
75
106
  "Title: #{@title}\n" +
@@ -78,6 +109,7 @@ module Whitepaper
78
109
  "Keywords: #{@keywords}\n" +
79
110
  "Year: #{@year}\n" +
80
111
  "Conference: #{@conference}\n" +
112
+ "More info: #{@metadata_url}\n" +
81
113
 
82
114
  "Pdf Available: #{@pdf_urls}\n" +
83
115
  "Ps Available: #{@ps_urls}"
@@ -1,4 +1,4 @@
1
1
  module Whitepaper
2
2
  # Version number for Whitepaper gem.
3
- VERSION = "0.0.2"
3
+ VERSION = "0.0.3"
4
4
  end
data/lib/whitepaper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "whitepaper/version"
2
2
 
3
+ require 'whitepaper/engine/acm'
3
4
  require 'whitepaper/engine/citeseerx'
4
5
  require 'whitepaper/engine/google'
5
6
 
@@ -12,7 +13,11 @@ module Whitepaper
12
13
  class << self
13
14
  # Find and return a Whitepaper::Paper by searching for a partial match with the given title.
14
15
  def find_by_title(title)
15
- paper = Engine::CiteSeerX.find_by_title(title)
16
+ paper_csx = Engine::CiteSeerX.find_by_title(title)
17
+ paper_acm = Engine::ACM.find_by_title(title)
18
+ paper_i3e = Engine::ACM.find_by_title(title)
19
+
20
+ paper = [paper_csx, paper_i3e, paper_acm].sort{|a,b| b.score_by_title(title) <=> a.score_by_title(title)}.first
16
21
 
17
22
  if paper.pdf_urls.empty?
18
23
  g = Engine::Google.find_by_title(title)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whitepaper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -59,8 +59,10 @@ files:
59
59
  - bin/whitepaper
60
60
  - lib/whitepaper.rb
61
61
  - lib/whitepaper/cli.rb
62
+ - lib/whitepaper/engine/acm.rb
62
63
  - lib/whitepaper/engine/citeseerx.rb
63
64
  - lib/whitepaper/engine/google.rb
65
+ - lib/whitepaper/engine/ieeexplore.rb
64
66
  - lib/whitepaper/paper.rb
65
67
  - lib/whitepaper/version.rb
66
68
  - whitepaper.gemspec