rubyscholar 0.0.5 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +4 -12
- data/bin/rubyscholar +31 -57
- data/example.config.yml +51 -0
- data/lib/rubyscholar-main.rb +138 -0
- data/lib/{rubyscholar/version.rb → rubyscholar-version.rb} +1 -1
- data/lib/rubyscholar-version.rb~ +3 -0
- data/lib/{rubyscholar.rb → rubyscholar.rb~} +28 -29
- data/rubyscholar.gemspec +36 -19
- metadata +36 -28
- data/.gitignore +0 -18
- data/CHANGELOG.md +0 -6
- data/Rakefile +0 -1
- data/bin/scrape.rb +0 -20
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 68cc9491e42d735441cfd82e8e18c108a84ec05d
|
4
|
+
data.tar.gz: 5cc483aa8e259775a599b588cf658a219d5389f9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: de91f4aa92392ce239fe9e093ca804f6725fa0ae05a1c6419ff49022cb3f12888961a8ea2f65c9fb35590b8b446620e5694385e2e35cf159b8beab632fc8e76d
|
7
|
+
data.tar.gz: face85b2e90f71e9d6c37e281a81a4e78011975e1ae820449fb2b995863d7195219ad4794fd7137a5420d2ae5a2f6b19433bc90a51f55883ea70a50d4d4b47f9
|
data/README.md
CHANGED
@@ -13,16 +13,6 @@ Some features:
|
|
13
13
|
|
14
14
|
# How to use:
|
15
15
|
|
16
|
-
### As a Ruby Gem:
|
17
|
-
1. Install the gem using: `[sudo] gem install rubyscholar`
|
18
|
-
2. Create and configure a `config.yml` file.
|
19
|
-
To create a `config.yml` file, run `$ rubyscholar init`
|
20
|
-
Edit the file, filling in your details.
|
21
|
-
3. Run as `$ rubyscholar scrape --out file.html `.
|
22
|
-
4. A `file.html` file is created containing your citations all formatted
|
23
|
-
and ready to use.
|
24
|
-
5. Done!
|
25
|
-
|
26
16
|
### As a ruby script:
|
27
17
|
1. Configure "config.yml"
|
28
18
|
If you want DOI retrieval to work (including Altmetrics), you need to be
|
@@ -35,8 +25,10 @@ and ready to use.
|
|
35
25
|
|
36
26
|
* uses author list as visible on your main Google Scholar page. Sometimes this
|
37
27
|
means names are chopped in two or just a single author is missing. This could
|
38
|
-
be made smarter.
|
39
|
-
* flexible
|
28
|
+
be made smarter (by following the link to get the full author list).
|
29
|
+
* output format could be more flexible (e.g. change order (e.g. title before authors), or change formatting (e.g. remove first initial)). Perhaps this could be done by providing a regexp search/replace configuration option within each field.
|
30
|
+
* Ensure that a true email is entered.
|
31
|
+
* right now only works from "user profile" pages, not from "articles citing article" pages.
|
40
32
|
* flexible use of DOIs
|
41
33
|
|
42
34
|
# Technologies
|
data/bin/rubyscholar
CHANGED
@@ -1,76 +1,50 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require 'optparse'
|
4
3
|
require 'rubygems'
|
4
|
+
require 'optparse' # YW is this needed or redundant with commander?
|
5
5
|
require 'commander/import'
|
6
|
-
require 'rubyscholar'
|
7
6
|
require 'yaml'
|
7
|
+
require 'rubyscholar-main'
|
8
8
|
|
9
9
|
|
10
|
-
program :name, '
|
11
|
-
program :version,
|
10
|
+
program :name, 'rubyscholar'
|
11
|
+
program :version, Rubyscholar::VERSION
|
12
12
|
program :description, 'Rubyscholar scrapes google scholar and formats it into a scholar.html file.'
|
13
13
|
|
14
14
|
default_command :scrape
|
15
15
|
|
16
16
|
command :scrape do |c|
|
17
17
|
c.syntax = 'rubyscholar scrape [options]'
|
18
|
-
c.
|
19
|
-
c.description = "Scape google scholar for new publications"
|
18
|
+
c.description = "Scrape Google Scholar for new publications"
|
20
19
|
|
21
20
|
c.option '--config [Config File]', 'Config file to use'
|
22
|
-
c.option '--
|
23
|
-
|
24
|
-
c.action do |args, options|
|
25
|
-
options.default :config => "config.yml", :out => "scholar.html"
|
26
|
-
configFile= "#{options.config}"
|
27
|
-
config = YAML.load_file(configFile)
|
28
|
-
parsed = Rubyscholar::Parser.new(config["url"],
|
29
|
-
config["email"])
|
30
|
-
formatter = Rubyscholar::Formatter.new(parsed,
|
31
|
-
config["highlight"],
|
32
|
-
config["pdfs"],
|
33
|
-
config["altmetricDOIs"],
|
34
|
-
config["minCitations"].to_i)
|
35
|
-
|
36
|
-
html = formatter.to_html
|
37
|
-
config["italicize"].each do |term|
|
38
|
-
html.gsub!( term , '<em>' + term + '</em>')
|
39
|
-
end
|
40
|
-
outFile="#{options.out}"
|
41
|
-
f= File.open(outFile,'w')
|
42
|
-
f.write html
|
43
|
-
f.close()
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
command :init do |c|
|
48
|
-
c.syntax = 'rubyscholar init'
|
49
|
-
c.description = 'Creates a sample config.yml file for Scraping.'
|
21
|
+
c.option '--output [Output File]', 'HTML output for publication list'
|
22
|
+
|
50
23
|
c.action do |args, options|
|
51
|
-
|
52
|
-
|
53
|
-
|
24
|
+
options.default \
|
25
|
+
:config => 'config.yaml',
|
26
|
+
:output => 'publications.html'
|
27
|
+
# the rest stays in this block because we need to access options
|
28
|
+
|
29
|
+
raise IOError, "You must specify config file via --config\n" if options.config.nil?
|
30
|
+
config = YAML.load_file(options.config)
|
31
|
+
parsed = Rubyscholar::Parser.new(config["url"],
|
32
|
+
config["email"])
|
33
|
+
html = Rubyscholar::Formatter.new(parsed,
|
34
|
+
config["highlight"],
|
35
|
+
config["pdfs"],
|
36
|
+
config["altmetricDOIs"],
|
37
|
+
config["minCitations"].to_i
|
38
|
+
).to_html
|
39
|
+
|
40
|
+
config["italicize"].each do |term|
|
41
|
+
html.gsub!( term , '<em>' + term + '</em>')
|
54
42
|
end
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
page.puts "\n\n# Need an Email address that has been registered with CrossRef to obtain DOIs using their OpenURL service. "
|
62
|
-
page.puts "# e.g. the following should provide an XML file: "
|
63
|
-
page.puts "# http://www.crossref.org/openurl?redirect=false&pid=YOUR@EMAIL>COM&aulast=Wurm&atitle=Behavioral%20Genomics:%20A,%20Bee,%20C,%20G,%20T"
|
64
|
-
page.puts "email: your@email.com"
|
65
|
-
page.puts "\n\n# Show \"[Cited Nx]\" if N > the following number"
|
66
|
-
page.puts "minCitations: 5 "
|
67
|
-
page.puts "\n\n# Words to italicize (emphasize). These will have \"<em>\" around them. "
|
68
|
-
page.puts "italicize: "
|
69
|
-
page.puts "\n\n# DOIs of articles for which we should show altmetric.org badges. "
|
70
|
-
page.puts "altmetricDOIs: "
|
71
|
-
page.puts "\n\n# Article titles for which we have urls to PDFs in \"name\" : \"url\" format"
|
72
|
-
page.puts "pdfs:"
|
73
|
-
end
|
74
|
-
end
|
43
|
+
|
44
|
+
STDERR << "Outputting to #{options.output}\n."
|
45
|
+
f = File.open(options.output,'w')
|
46
|
+
f.write html
|
47
|
+
f.close
|
48
|
+
end
|
75
49
|
end
|
76
50
|
|
data/example.config.yml
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# Google Scholar page (you can choose how you sort it)
|
2
|
+
url: "http://scholar.google.com/citations?hl=en&user=k6y0EGsAAAAJ&sortby=pubdate&view_op=list_works&pagesize=100"
|
3
|
+
|
4
|
+
#### Everything below this line is OPTIONAL ####
|
5
|
+
|
6
|
+
# Name to highlight
|
7
|
+
highlight: "Y Wurm"
|
8
|
+
|
9
|
+
|
10
|
+
# Need an Email address that has been registered with CrossRef to obtain DOIs
|
11
|
+
# using their OpenURL service.
|
12
|
+
# e.g. the following should provide an XML file:
|
13
|
+
# http://www.crossref.org/openurl?redirect=false&pid=YOUR@EMAIL.COM&aulast=Wurm&atitle=Behavioral%20Genomics:%20A,%20Bee,%20C,%20G,%20T
|
14
|
+
email: your@email.com
|
15
|
+
|
16
|
+
|
17
|
+
# Show "[Cited Nx]" if N > the following number
|
18
|
+
minCitations: 5
|
19
|
+
|
20
|
+
# Words to italicize (emphasize). These will have "<em>" around them.
|
21
|
+
italicize:
|
22
|
+
- Solenopsis invicta
|
23
|
+
- Acromyrmex echinatior
|
24
|
+
- de novo
|
25
|
+
|
26
|
+
# DOIs of articles for which we should show altmetric.org badges.
|
27
|
+
altmetricDOIs:
|
28
|
+
- "10.1038/nature11832"
|
29
|
+
- "10.1101/gr.121392.111"
|
30
|
+
- "10.1073/pnas.1009690108"
|
31
|
+
- "10.1073/pnas.1104825108"
|
32
|
+
|
33
|
+
# Article titles for which we have urls to PDFs
|
34
|
+
pdfs:
|
35
|
+
"A Y-like social chromosome causes alternative colony organization in fire ants" : "/publications/wangwurm2013socialChromosome.pdf"
|
36
|
+
"Duplication and concerted evolution in a master sex determiner under balancing selection" : "/publications/procb2013.pdf"
|
37
|
+
"Comparative genomics of chemosensory protein genes reveals rapid evolution and positive selection in ant-specific duplicates" : "/publications/hdy2012122a.pdf"
|
38
|
+
"The Molecular Clockwork of the Fire Ant Solenopsis invicta" : "/publications/ingram2012-fireAntClockGenes.pdf"
|
39
|
+
"Epigenetics: The Making of Ant Castes" : "/publications/2012CurrBiolAntepigenetics.pdf"
|
40
|
+
"Visualization and quality assessment of de novo genome assemblies" : "/publications/Bioinformatics-2011-Riba-Grognuz-3425-6"
|
41
|
+
"The genomic impact of 100 million years of social evolution in seven ant species" : "/publications/TiG2011.pdf"
|
42
|
+
"Relaxed selection is a precursor to the evolution of phenotypic plasticity" : "/publications/hunt2011phenotypicPlasticity.pdf"
|
43
|
+
"The genome of the leaf-cutting ant Acromyrmex echinatior suggests key adaptations to advanced social life and fungus farming" : "/publications/nygaard2011-acromyrmex-genome.pdf"
|
44
|
+
"Behind the Scenes of an Ant Genome Project" : "/publications/wurm2011antGenomeBehindTheScenes.pdf"
|
45
|
+
"The genome of the fire ant Solenopsis invicta" : "/publications/wurm2011fireAntGenome.pdf"
|
46
|
+
"Odorant Binding Proteins of the Red Imported Fire Ant, Solenopsis invicta: An Example of the Problems Facing the Analysis of Widely Divergent Proteins" : "/publications/gotzek2011obps.pdf"
|
47
|
+
"Parasitoid Wasps: From Natural History to Genomic Studies" : "/publications/wurm2010wasps.pdf"
|
48
|
+
"Changes in reproductive roles are associated with changes in gene expression in fire ant queens" : "/publications/wurm2010fireAntQueenDealationExpression.pdf"
|
49
|
+
"Fourmidable: a database for ant genomics" : "/publications/wurm2009antDatabase.pdf"
|
50
|
+
"Behavioral Genomics: A, Bee, C, G, T" : "/publications/wurm2007bees.pdf"
|
51
|
+
"An annotated cDNA library and microarray for large-scale gene-expression studies in the ant Solenopsis invicta" : "/publications/wang2007fireAntMicroarrays.pdf"
|
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rubyscholar-version'
|
4
|
+
|
5
|
+
|
6
|
+
class String
|
7
|
+
def clean
|
8
|
+
# removes leading and trailing whitespace, commas
|
9
|
+
self.gsub!(/(^[\s,]+)|([\s,]+$)/, '')
|
10
|
+
return self
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
module Rubyscholar
|
15
|
+
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
16
|
+
end
|
17
|
+
|
18
|
+
class Parser
|
19
|
+
attr_accessor :parsedPapers, :crossRefEmail
|
20
|
+
|
21
|
+
def initialize(url, crossRefEmail = "")
|
22
|
+
@parsedPapers = []
|
23
|
+
@crossRefEmail = crossRefEmail # if nil doesn't return any DOI
|
24
|
+
parse(url)
|
25
|
+
end
|
26
|
+
|
27
|
+
def parse(url)
|
28
|
+
STDERR << "Will check #{url}.\n"
|
29
|
+
page = Nokogiri::HTML(open(url,
|
30
|
+
'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'), nil, 'utf-8')
|
31
|
+
papers = page.css(".gsc_a_tr")
|
32
|
+
STDERR << "Found #{papers.length} papers.\n"
|
33
|
+
papers.each do |paper|
|
34
|
+
title = paper.css(".gsc_a_at").text rescue ''
|
35
|
+
title.gsub!(/\.$/, '')
|
36
|
+
|
37
|
+
googleUrl = paper.children[0].children[0].attribute('href').text rescue ''
|
38
|
+
authors = paper.children[0].children[1].text.clean rescue ''
|
39
|
+
authors.gsub!("...", "et al")
|
40
|
+
|
41
|
+
journal = paper.children[0].children[2].text rescue ''
|
42
|
+
journalName = journal.split(/,|\d/).first.clean rescue ''
|
43
|
+
journalDetails = journal.gsub(journalName, '').clean
|
44
|
+
year = journalDetails.match(/, \d+$/)[0]
|
45
|
+
journalDetails = journalDetails.gsub(year, '').clean
|
46
|
+
year = year.clean
|
47
|
+
|
48
|
+
#citations
|
49
|
+
citeInfo = paper.css('.gsc_a_ac')
|
50
|
+
citationCount = citeInfo.text
|
51
|
+
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
52
|
+
|
53
|
+
# get DOI: needs last name of first author, no funny chars
|
54
|
+
lastNameFirstAuthor = ((authors.split(',').first ).split(' ').last ).gsub(/[^A-Za-z\-]/, '')
|
55
|
+
doi = getDoi( lastNameFirstAuthor, title, @crossRefEmail)
|
56
|
+
|
57
|
+
@parsedPapers.push(Paper.new( title, googleUrl, authors, journalName, journalDetails, year, citationCount, citationUrl, doi))
|
58
|
+
end
|
59
|
+
STDERR << "Scraped #{parsedPapers.length} from Google Scholar.\n"
|
60
|
+
end
|
61
|
+
|
62
|
+
# Scholar doesn't provide DOI.
|
63
|
+
# But if registered at CrossRef (it's free), the DOI can be retrieved.
|
64
|
+
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
|
65
|
+
return '' if @crossRefEmail.nil?
|
66
|
+
sleep(1) # to reduce risk
|
67
|
+
STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"
|
68
|
+
url = 'http://www.crossref.org/openurl?redirect=false' +
|
69
|
+
'&pid=' + crossRefEmail +
|
70
|
+
'&aulast=' + lastNameFirstAuthor +
|
71
|
+
'&atitle=' + URI.escape(title)
|
72
|
+
crossRefXML = Nokogiri::XML(open(url))
|
73
|
+
crossRefXML.search("doi").children.first.content rescue ''
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
class Formatter
|
78
|
+
attr_accessor :parser, :nameToHighlight, :pdfLinks, :altmetricDOIs
|
79
|
+
|
80
|
+
def initialize(parser, nameToHighlight = nil, pdfLinks = {}, altmetricDOIs = [], minCitationCount = 1)
|
81
|
+
@parser = parser
|
82
|
+
@nameToHighlight = nameToHighlight
|
83
|
+
@pdfLinks = pdfLinks
|
84
|
+
@altmetricDOIs = altmetricDOIs
|
85
|
+
@minCitations = minCitationCount
|
86
|
+
end
|
87
|
+
|
88
|
+
def to_html
|
89
|
+
builder = Nokogiri::HTML::Builder.new do |doc|
|
90
|
+
doc.div( :class => "publication") {
|
91
|
+
doc.ol {
|
92
|
+
@parser.parsedPapers.each_with_index do |paper, index|
|
93
|
+
doc.li( :value=> ( (@parser.parsedPapers).length - index).to_s) {
|
94
|
+
doc.b paper[:title] + '.'
|
95
|
+
doc.text ' (' + paper[:year] + '). '
|
96
|
+
if paper[:authors].include?(@nameToHighlight)
|
97
|
+
doc.text( paper[:authors].sub(Regexp.new(@nameToHighlight + '.*'), '') )
|
98
|
+
doc.span( :class => "label") { doc.text @nameToHighlight }
|
99
|
+
doc.text( paper[:authors].sub(Regexp.new('.*' + @nameToHighlight), '') )
|
100
|
+
else
|
101
|
+
doc.text( paper[:authors]) + '.'
|
102
|
+
end
|
103
|
+
|
104
|
+
doc.em ' ' + paper[:journalName]
|
105
|
+
doc.text ' ' + paper[:journalDetails]
|
106
|
+
unless paper[ :doi].empty?
|
107
|
+
doc.text(' ')
|
108
|
+
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
109
|
+
doc.text "[DOI]"
|
110
|
+
}
|
111
|
+
end
|
112
|
+
if @pdfLinks.keys.include?(paper[:title])
|
113
|
+
doc.text(' ')
|
114
|
+
doc.a( :href => @pdfLinks[paper[:title]]) {
|
115
|
+
doc.text "[PDF]"
|
116
|
+
}
|
117
|
+
end
|
118
|
+
if paper[ :citationCount].to_i > @minCitations
|
119
|
+
doc.text(' ')
|
120
|
+
doc.a( :href => paper[ :citingPapers], :title => "Citations") {
|
121
|
+
doc.span( :class => "badge badge-inverse") { doc.test("#{paper[ :citationCount]}x") }
|
122
|
+
}
|
123
|
+
end
|
124
|
+
if altmetricDOIs.include?( paper[ :doi])
|
125
|
+
doc.text(' ')
|
126
|
+
doc.span( :class => 'altmetric-embed',
|
127
|
+
:'data-badge-popover' => 'bottom',
|
128
|
+
:'data-doi' => paper[ :doi] )
|
129
|
+
end
|
130
|
+
}
|
131
|
+
end
|
132
|
+
}
|
133
|
+
}
|
134
|
+
end
|
135
|
+
return builder.to_html
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
@@ -1,32 +1,31 @@
|
|
1
|
-
require "rubyscholar/version"
|
2
1
|
require "nokogiri"
|
3
2
|
require "open-uri"
|
4
3
|
|
5
|
-
|
4
|
+
module Rubyscholar
|
6
5
|
class String
|
7
6
|
def clean
|
8
7
|
# removes leading and trailing whitespace, commas
|
9
8
|
self.gsub!(/(^[\s,]+)|([\s,]+$)/, '')
|
10
9
|
return self
|
11
10
|
end
|
12
|
-
|
11
|
+
end
|
13
12
|
|
14
|
-
module Rubyscholar
|
15
|
-
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
16
|
-
end
|
17
13
|
|
14
|
+
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
15
|
+
end
|
16
|
+
|
18
17
|
class Parser
|
19
18
|
attr_accessor :parsedPapers, :crossRefEmail
|
20
|
-
|
19
|
+
|
21
20
|
def initialize(url, crossRefEmail = "")
|
22
21
|
@parsedPapers = []
|
23
|
-
@crossRefEmail = crossRefEmail # if nil doesn't
|
22
|
+
@crossRefEmail = crossRefEmail # if nil doesn't return any DOI
|
24
23
|
parse(url)
|
25
24
|
end
|
26
25
|
|
27
26
|
def parse(url)
|
28
27
|
papers = Nokogiri::HTML(open(url)).css(".cit-table .item")
|
29
|
-
|
28
|
+
STDERR << "Found #{papers.length} papers.\n"
|
30
29
|
papers.each do |paper|
|
31
30
|
paperDetails = paper.css("#col-title")
|
32
31
|
title = paperDetails[0].children[0].content.clean
|
@@ -43,7 +42,7 @@ module Rubyscholar
|
|
43
42
|
#citations
|
44
43
|
citeInfo = paper.css(".cit-dark-link")
|
45
44
|
citationCount = citeInfo.text
|
46
|
-
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
45
|
+
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
47
46
|
|
48
47
|
# get DOI: needs last name of first author, no funny chars
|
49
48
|
lastNameFirstAuthor = ((authors.split(',').first ).split(' ').last ).gsub(/[^A-Za-z\-]/, '')
|
@@ -51,27 +50,27 @@ module Rubyscholar
|
|
51
50
|
|
52
51
|
@parsedPapers.push(Paper.new( title, googleUrl, authors, journalName, journalDetails, year, citationCount, citationUrl, doi))
|
53
52
|
end
|
54
|
-
|
53
|
+
STDERR << "Scraped #{parsedPapers.length} from Google Scholar.\n"
|
55
54
|
end
|
56
55
|
|
57
|
-
# Scholar doesn't provide DOI.
|
58
|
-
# But if registered at crossref (its free), DOI can be retreived.
|
56
|
+
# Scholar doesn't provide DOI.
|
57
|
+
# But if registered at crossref (its free), DOI can be retreived.
|
59
58
|
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
|
60
59
|
return '' if @crossRefEmail.nil?
|
61
|
-
sleep(1) # to reduce risk
|
60
|
+
sleep(1) # to reduce risk
|
62
61
|
STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"
|
63
|
-
url = 'http://www.crossref.org/openurl?redirect=false' +
|
64
|
-
'&pid=' + crossRefEmail +
|
62
|
+
url = 'http://www.crossref.org/openurl?redirect=false' +
|
63
|
+
'&pid=' + crossRefEmail +
|
65
64
|
'&aulast=' + lastNameFirstAuthor +
|
66
65
|
'&atitle=' + URI.escape(title)
|
67
|
-
crossRefXML = Nokogiri::XML(open(url))
|
66
|
+
crossRefXML = Nokogiri::XML(open(url))
|
68
67
|
crossRefXML.search("doi").children.first.content rescue ''
|
69
68
|
end
|
70
69
|
end
|
71
|
-
|
70
|
+
|
72
71
|
class Formatter
|
73
72
|
attr_accessor :parser, :nameToHighlight, :pdfLinks, :altmetricDOIs
|
74
|
-
|
73
|
+
|
75
74
|
def initialize(parser, nameToHighlight = nil, pdfLinks = {}, altmetricDOIs = [], minCitationCount = 1)
|
76
75
|
@parser = parser
|
77
76
|
@nameToHighlight = nameToHighlight
|
@@ -81,14 +80,14 @@ module Rubyscholar
|
|
81
80
|
end
|
82
81
|
|
83
82
|
def to_html
|
84
|
-
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
83
|
+
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
85
84
|
builder = Nokogiri::HTML::Builder.new do |doc|
|
86
85
|
doc.html {
|
87
86
|
doc.body {
|
88
87
|
@parser.parsedPapers.each_with_index { |paper, index|
|
89
88
|
doc.div( :class => "publication") {
|
90
89
|
doc.p {
|
91
|
-
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
90
|
+
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
92
91
|
|
93
92
|
doc.b paper[:title] + '.'
|
94
93
|
doc.text ' (' + paper[:year] + '). '
|
@@ -107,21 +106,21 @@ module Rubyscholar
|
|
107
106
|
doc.text paper[:journalDetails]
|
108
107
|
unless paper[ :doi].empty?
|
109
108
|
doc.text(' ')
|
110
|
-
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
111
|
-
doc.text "[DOI]"
|
112
|
-
}
|
109
|
+
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
110
|
+
doc.text "[DOI]"
|
111
|
+
}
|
113
112
|
end
|
114
113
|
if @pdfLinks.keys.include?(paper[:title])
|
115
114
|
doc.text(' ')
|
116
|
-
doc.a( :href => @pdfLinks[paper[:title]]) {
|
115
|
+
doc.a( :href => @pdfLinks[paper[:title]]) {
|
117
116
|
doc.text "[PDF]"
|
118
|
-
}
|
117
|
+
}
|
119
118
|
end
|
120
119
|
if paper[ :citationCount].to_i > @minCitations
|
121
120
|
doc.text(' ')
|
122
|
-
doc.a( :href => paper[ :citingPapers]) {
|
123
|
-
doc.text("[Cited #{paper[ :citationCount]}x]")
|
124
|
-
}
|
121
|
+
doc.a( :href => paper[ :citingPapers]) {
|
122
|
+
doc.text("[Cited #{paper[ :citationCount]}x]")
|
123
|
+
}
|
125
124
|
end
|
126
125
|
if altmetricDOIs.include?( paper[ :doi])
|
127
126
|
doc.text(' ')
|
data/rubyscholar.gemspec
CHANGED
@@ -1,23 +1,40 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'rubyscholar/version'
|
1
|
+
require './lib/rubyscholar-version.rb'
|
5
2
|
|
6
3
|
Gem::Specification.new do |gem|
|
7
|
-
gem.name =
|
4
|
+
gem.name = 'rubyscholar'
|
8
5
|
gem.version = Rubyscholar::VERSION
|
9
|
-
gem.authors = [
|
10
|
-
gem.email = [
|
11
|
-
gem.description = %q{Scrape Google Scholar}
|
12
|
-
gem.summary = %q{Rubyscholar scrapes google scholar and formats it into a scholar.html file.}
|
13
|
-
gem.homepage =
|
14
|
-
gem.license =
|
15
|
-
|
16
|
-
|
17
|
-
gem.
|
18
|
-
|
19
|
-
|
20
|
-
gem.
|
21
|
-
gem.
|
22
|
-
gem.
|
6
|
+
gem.authors = ['Yannick Wurm','Gaurav Koley']
|
7
|
+
gem.email = ['y.wurm@qmul.ac.uk','arkokoley@live.in']
|
8
|
+
gem.description = %q{Scrape Google Scholar Profile page}
|
9
|
+
gem.summary = %q{Rubyscholar scrapes one google scholar and formats it into a scholar.html file listing publications.}
|
10
|
+
gem.homepage = 'http://yannick.poulet.org/'
|
11
|
+
gem.license = 'MIT'
|
12
|
+
|
13
|
+
|
14
|
+
gem.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.0'
|
15
|
+
gem.add_runtime_dependency 'commander', '~> 4.1', '>= 4.1.5'
|
16
|
+
|
17
|
+
gem.files = Dir['lib/**/*'] + Dir['views/**/*'] + Dir['public/**/*'] + Dir['tests/**/*']
|
18
|
+
gem.files = gem.files + ['example.config.yml']
|
19
|
+
gem.files = gem.files + ['LICENSE.txt', 'README.md']
|
20
|
+
gem.files = gem.files + ['Gemfile', 'rubyscholar.gemspec']
|
21
|
+
|
22
|
+
gem.executables = ['rubyscholar']
|
23
|
+
gem.require_paths = ['lib']
|
24
|
+
|
25
|
+
gem.post_install_message = <<INFO
|
26
|
+
|
27
|
+
-----
|
28
|
+
Thanks for installing rubyscholar.
|
29
|
+
If something isn't working, this may be due to Google's changing the format of Scholar pages
|
30
|
+
(they do this regularly with no warning).
|
31
|
+
|
32
|
+
If you can have a shot at fixing it, please go ahead - pull requests are most welcome.
|
33
|
+
|
34
|
+
All the best,
|
35
|
+
|
36
|
+
Yannick - http://yannick.poulet.org
|
37
|
+
----
|
38
|
+
|
39
|
+
INFO
|
23
40
|
end
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyscholar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: '0.2'
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Yannick Wurm
|
@@ -10,84 +9,93 @@ authors:
|
|
10
9
|
autorequire:
|
11
10
|
bindir: bin
|
12
11
|
cert_chain: []
|
13
|
-
date:
|
12
|
+
date: 2014-09-20 00:00:00.000000000 Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
15
|
name: nokogiri
|
17
16
|
requirement: !ruby/object:Gem::Requirement
|
18
|
-
none: false
|
19
17
|
requirements:
|
20
|
-
- - ~>
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.6'
|
21
|
+
- - ">="
|
21
22
|
- !ruby/object:Gem::Version
|
22
23
|
version: 1.6.0
|
23
24
|
type: :runtime
|
24
25
|
prerelease: false
|
25
26
|
version_requirements: !ruby/object:Gem::Requirement
|
26
|
-
none: false
|
27
27
|
requirements:
|
28
|
-
- - ~>
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.6'
|
31
|
+
- - ">="
|
29
32
|
- !ruby/object:Gem::Version
|
30
33
|
version: 1.6.0
|
31
34
|
- !ruby/object:Gem::Dependency
|
32
35
|
name: commander
|
33
36
|
requirement: !ruby/object:Gem::Requirement
|
34
|
-
none: false
|
35
37
|
requirements:
|
36
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '4.1'
|
41
|
+
- - ">="
|
37
42
|
- !ruby/object:Gem::Version
|
38
43
|
version: 4.1.5
|
39
44
|
type: :runtime
|
40
45
|
prerelease: false
|
41
46
|
version_requirements: !ruby/object:Gem::Requirement
|
42
|
-
none: false
|
43
47
|
requirements:
|
44
|
-
- - ~>
|
48
|
+
- - "~>"
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '4.1'
|
51
|
+
- - ">="
|
45
52
|
- !ruby/object:Gem::Version
|
46
53
|
version: 4.1.5
|
47
|
-
description: Scrape Google Scholar
|
54
|
+
description: Scrape Google Scholar Profile page
|
48
55
|
email:
|
49
56
|
- y.wurm@qmul.ac.uk
|
50
57
|
- arkokoley@live.in
|
51
58
|
executables:
|
52
59
|
- rubyscholar
|
53
|
-
- scrape.rb
|
54
60
|
extensions: []
|
55
61
|
extra_rdoc_files: []
|
56
62
|
files:
|
57
|
-
- .gitignore
|
58
|
-
- CHANGELOG.md
|
59
63
|
- Gemfile
|
60
64
|
- LICENSE.txt
|
61
65
|
- README.md
|
62
|
-
- Rakefile
|
63
66
|
- bin/rubyscholar
|
64
|
-
-
|
65
|
-
- lib/rubyscholar.rb
|
66
|
-
- lib/rubyscholar
|
67
|
+
- example.config.yml
|
68
|
+
- lib/rubyscholar-main.rb
|
69
|
+
- lib/rubyscholar-version.rb
|
70
|
+
- lib/rubyscholar-version.rb~
|
71
|
+
- lib/rubyscholar.rb~
|
67
72
|
- rubyscholar.gemspec
|
68
|
-
homepage: http://
|
73
|
+
homepage: http://yannick.poulet.org/
|
69
74
|
licenses:
|
70
75
|
- MIT
|
71
|
-
|
76
|
+
metadata: {}
|
77
|
+
post_install_message: "\n-----\nThanks for installing rubyscholar. \nIf something
|
78
|
+
isn't working, this may be due to Google's changing the format of Scholar pages\n(they
|
79
|
+
do this regularly with no warning). \n\nIf you can have a shot at fixing it, please
|
80
|
+
go ahead - pull requests are most welcome.\n\nAll the best, \n\nYannick - http://yannick.poulet.org\n----\n\n"
|
72
81
|
rdoc_options: []
|
73
82
|
require_paths:
|
74
83
|
- lib
|
75
84
|
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
-
none: false
|
77
85
|
requirements:
|
78
|
-
- -
|
86
|
+
- - ">="
|
79
87
|
- !ruby/object:Gem::Version
|
80
88
|
version: '0'
|
81
89
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
-
none: false
|
83
90
|
requirements:
|
84
|
-
- -
|
91
|
+
- - ">="
|
85
92
|
- !ruby/object:Gem::Version
|
86
93
|
version: '0'
|
87
94
|
requirements: []
|
88
95
|
rubyforge_project:
|
89
|
-
rubygems_version:
|
96
|
+
rubygems_version: 2.2.2
|
90
97
|
signing_key:
|
91
|
-
specification_version:
|
92
|
-
summary: Rubyscholar scrapes google scholar and formats it into a scholar.html
|
98
|
+
specification_version: 4
|
99
|
+
summary: Rubyscholar scrapes one google scholar and formats it into a scholar.html
|
100
|
+
file listing publications.
|
93
101
|
test_files: []
|
data/.gitignore
DELETED
data/CHANGELOG.md
DELETED
data/Rakefile
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
require "bundler/gem_tasks"
|
data/bin/scrape.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
require_relative '../lib/rubyscholar'
|
2
|
-
require 'yaml'
|
3
|
-
|
4
|
-
config = YAML.load_file('config.yml')
|
5
|
-
parsed = Rubyscholar::Parser.new(config["url"],
|
6
|
-
config["email"])
|
7
|
-
formatter = Rubyscholar::Formatter.new(parsed,
|
8
|
-
config["highlight"],
|
9
|
-
config["pdfs"],
|
10
|
-
config["altmetricDOIs"],
|
11
|
-
config["minCitations"].to_i)
|
12
|
-
|
13
|
-
html = formatter.to_html
|
14
|
-
config["italicize"].each do |term|
|
15
|
-
html.gsub!( term , '<em>' + term + '</em>')
|
16
|
-
end
|
17
|
-
|
18
|
-
f= File.open('scholar.html','w')
|
19
|
-
f.write html
|
20
|
-
f.close
|