rubyscholar 0.0.5 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +4 -12
- data/bin/rubyscholar +31 -57
- data/example.config.yml +51 -0
- data/lib/rubyscholar-main.rb +138 -0
- data/lib/{rubyscholar/version.rb → rubyscholar-version.rb} +1 -1
- data/lib/rubyscholar-version.rb~ +3 -0
- data/lib/{rubyscholar.rb → rubyscholar.rb~} +28 -29
- data/rubyscholar.gemspec +36 -19
- metadata +36 -28
- data/.gitignore +0 -18
- data/CHANGELOG.md +0 -6
- data/Rakefile +0 -1
- data/bin/scrape.rb +0 -20
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 68cc9491e42d735441cfd82e8e18c108a84ec05d
|
4
|
+
data.tar.gz: 5cc483aa8e259775a599b588cf658a219d5389f9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: de91f4aa92392ce239fe9e093ca804f6725fa0ae05a1c6419ff49022cb3f12888961a8ea2f65c9fb35590b8b446620e5694385e2e35cf159b8beab632fc8e76d
|
7
|
+
data.tar.gz: face85b2e90f71e9d6c37e281a81a4e78011975e1ae820449fb2b995863d7195219ad4794fd7137a5420d2ae5a2f6b19433bc90a51f55883ea70a50d4d4b47f9
|
data/README.md
CHANGED
@@ -13,16 +13,6 @@ Some features:
|
|
13
13
|
|
14
14
|
# How to use:
|
15
15
|
|
16
|
-
### As a Ruby Gem:
|
17
|
-
1. Install the gem using: `[sudo] gem install rubyscholar`
|
18
|
-
2. Create and configure a `config.yml` file.
|
19
|
-
To create a `config.yml` file, run `$ rubyscholar init`
|
20
|
-
Edit the file, filling in your details.
|
21
|
-
3. Run as `$ rubyscholar scrape --out file.html `.
|
22
|
-
4. A `file.html` file is created containing your citations all formatted
|
23
|
-
and ready to use.
|
24
|
-
5. Done!
|
25
|
-
|
26
16
|
### As a ruby script:
|
27
17
|
1. Configure "config.yml"
|
28
18
|
If you want DOI retrieval to work (including Altmetrics), you need to be
|
@@ -35,8 +25,10 @@ and ready to use.
|
|
35
25
|
|
36
26
|
* uses author list as visible on your main Google Scholar page. Sometimes this
|
37
27
|
means names are chopped in two or just a single author is missing. This could
|
38
|
-
be made smarter.
|
39
|
-
* flexible
|
28
|
+
be made smarter (by following the link to get the full author list).
|
29
|
+
* output format could be more flexible (e.g. change the order, such as title before authors, or change the formatting, such as removing the first initial). Perhaps this could be done by providing a regexp search/replace configuration option for each field.
|
30
|
+
* Ensure that a true email is entered.
|
31
|
+
* right now only works from "user profile" pages. Not from "articles citing article" pages.
|
40
32
|
* flexible use of DOIs
|
41
33
|
|
42
34
|
# Technologies
|
data/bin/rubyscholar
CHANGED
@@ -1,76 +1,50 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require 'optparse'
|
4
3
|
require 'rubygems'
|
4
|
+
require 'optparse' # YW is this needed or redundant with commander?
|
5
5
|
require 'commander/import'
|
6
|
-
require 'rubyscholar'
|
7
6
|
require 'yaml'
|
7
|
+
require 'rubyscholar-main'
|
8
8
|
|
9
9
|
|
10
|
-
program :name, '
|
11
|
-
program :version,
|
10
|
+
program :name, 'rubyscholar'
|
11
|
+
program :version, Rubyscholar::VERSION
|
12
12
|
program :description, 'Rubyscholar scrapes google scholar and formats it into a scholar.html file.'
|
13
13
|
|
14
14
|
default_command :scrape
|
15
15
|
|
16
16
|
command :scrape do |c|
|
17
17
|
c.syntax = 'rubyscholar scrape [options]'
|
18
|
-
c.
|
19
|
-
c.description = "Scape google scholar for new publications"
|
18
|
+
c.description = "Scrape Google Scholar for new publications"
|
20
19
|
|
21
20
|
c.option '--config [Config File]', 'Config file to use'
|
22
|
-
c.option '--
|
23
|
-
|
24
|
-
c.action do |args, options|
|
25
|
-
options.default :config => "config.yml", :out => "scholar.html"
|
26
|
-
configFile= "#{options.config}"
|
27
|
-
config = YAML.load_file(configFile)
|
28
|
-
parsed = Rubyscholar::Parser.new(config["url"],
|
29
|
-
config["email"])
|
30
|
-
formatter = Rubyscholar::Formatter.new(parsed,
|
31
|
-
config["highlight"],
|
32
|
-
config["pdfs"],
|
33
|
-
config["altmetricDOIs"],
|
34
|
-
config["minCitations"].to_i)
|
35
|
-
|
36
|
-
html = formatter.to_html
|
37
|
-
config["italicize"].each do |term|
|
38
|
-
html.gsub!( term , '<em>' + term + '</em>')
|
39
|
-
end
|
40
|
-
outFile="#{options.out}"
|
41
|
-
f= File.open(outFile,'w')
|
42
|
-
f.write html
|
43
|
-
f.close()
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
command :init do |c|
|
48
|
-
c.syntax = 'rubyscholar init'
|
49
|
-
c.description = 'Creates a sample config.yml file for Scraping.'
|
21
|
+
c.option '--output [Output File]', 'HTML output for publication list'
|
22
|
+
|
50
23
|
c.action do |args, options|
|
51
|
-
|
52
|
-
|
53
|
-
|
24
|
+
options.default \
|
25
|
+
:config => 'config.yaml',
|
26
|
+
:output => 'publications.html'
|
27
|
+
#rest stays in this block bc we need to access options
|
28
|
+
|
29
|
+
raise IOError, "You must specify config file via --config\n" if options.config.nil?
|
30
|
+
config = YAML.load_file(options.config)
|
31
|
+
parsed = Rubyscholar::Parser.new(config["url"],
|
32
|
+
config["email"])
|
33
|
+
html = Rubyscholar::Formatter.new(parsed,
|
34
|
+
config["highlight"],
|
35
|
+
config["pdfs"],
|
36
|
+
config["altmetricDOIs"],
|
37
|
+
config["minCitations"].to_i
|
38
|
+
).to_html
|
39
|
+
|
40
|
+
config["italicize"].each do |term|
|
41
|
+
html.gsub!( term , '<em>' + term + '</em>')
|
54
42
|
end
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
page.puts "\n\n# Need an Email address that has been registered with CrossRef to obtain DOIs using their OpenURL service. "
|
62
|
-
page.puts "# e.g. the following should provide an XML file: "
|
63
|
-
page.puts "# http://www.crossref.org/openurl?redirect=false&pid=YOUR@EMAIL>COM&aulast=Wurm&atitle=Behavioral%20Genomics:%20A,%20Bee,%20C,%20G,%20T"
|
64
|
-
page.puts "email: your@email.com"
|
65
|
-
page.puts "\n\n# Show \"[Cited Nx]\" if N > the following number"
|
66
|
-
page.puts "minCitations: 5 "
|
67
|
-
page.puts "\n\n# Words to italicize (emphasize). These will have \"<em>\" around them. "
|
68
|
-
page.puts "italicize: "
|
69
|
-
page.puts "\n\n# DOIs of articles for which we should show altmetric.org badges. "
|
70
|
-
page.puts "altmetricDOIs: "
|
71
|
-
page.puts "\n\n# Article titles for which we have urls to PDFs in \"name\" : \"url\" format"
|
72
|
-
page.puts "pdfs:"
|
73
|
-
end
|
74
|
-
end
|
43
|
+
|
44
|
+
STDERR << "Outputting to #{options.output}.\n" # period belongs before the newline, like the other progress messages
|
45
|
+
f = File.open(options.output,'w')
|
46
|
+
f.write html
|
47
|
+
f.close
|
48
|
+
end
|
75
49
|
end
|
76
50
|
|
data/example.config.yml
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# Google Scholar page (you can choose how you sort it)
|
2
|
+
url: "http://scholar.google.com/citations?hl=en&user=k6y0EGsAAAAJ&sortby=pubdate&view_op=list_works&pagesize=100"
|
3
|
+
|
4
|
+
#### Everything below this line is OPTIONAL ####
|
5
|
+
|
6
|
+
# Name to highlight
|
7
|
+
highlight: "Y Wurm"
|
8
|
+
|
9
|
+
|
10
|
+
# Need an Email address that has been registered with CrossRef to obtain DOIs
|
11
|
+
# using their OpenURL service.
|
12
|
+
# e.g. the following should provide an XML file:
|
13
|
+
# http://www.crossref.org/openurl?redirect=false&pid=YOUR@EMAIL>COM&aulast=Wurm&atitle=Behavioral%20Genomics:%20A,%20Bee,%20C,%20G,%20T
|
14
|
+
email: your@email.com
|
15
|
+
|
16
|
+
|
17
|
+
# Show "[Cited Nx]" if N > the following number
|
18
|
+
minCitations: 5
|
19
|
+
|
20
|
+
# Words to italicize (emphasize). These will have "<em>" around them.
|
21
|
+
italicize:
|
22
|
+
- Solenopsis invicta
|
23
|
+
- Acromyrmex echinatior
|
24
|
+
- de novo
|
25
|
+
|
26
|
+
# DOIs of articles for which we should show altmetric.org badges.
|
27
|
+
altmetricDOIs:
|
28
|
+
- "10.1038/nature11832"
|
29
|
+
- "10.1101/gr.121392.111"
|
30
|
+
- "10.1073/pnas.1009690108"
|
31
|
+
- "10.1073/pnas.1104825108"
|
32
|
+
|
33
|
+
# Article titles for which we have urls to PDFs
|
34
|
+
pdfs:
|
35
|
+
"A Y-like social chromosome causes alternative colony organization in fire ants" : "/publications/wangwurm2013socialChromosome.pdf"
|
36
|
+
"Duplication and concerted evolution in a master sex determiner under balancing selection" : "/publications/procb2013.pdf"
|
37
|
+
"Comparative genomics of chemosensory protein genes reveals rapid evolution and positive selection in ant-specific duplicates" : "/publications/hdy2012122a.pdf"
|
38
|
+
"The Molecular Clockwork of the Fire Ant Solenopsis invicta" : "/publications/ingram2012-fireAntClockGenes.pdf"
|
39
|
+
"Epigenetics: The Making of Ant Castes" : "/publications/2012CurrBiolAntepigenetics.pdf"
|
40
|
+
"Visualization and quality assessment of de novo genome assemblies" : "/publications/Bioinformatics-2011-Riba-Grognuz-3425-6"
|
41
|
+
"The genomic impact of 100 million years of social evolution in seven ant species" : "/publications/TiG2011.pdf"
|
42
|
+
"Relaxed selection is a precursor to the evolution of phenotypic plasticity" : "/publications/hunt2011phenotypicPlasticity.pdf"
|
43
|
+
"The genome of the leaf-cutting ant Acromyrmex echinatior suggests key adaptations to advanced social life and fungus farming" : "/publications/nygaard2011-acromyrmex-genome.pdf"
|
44
|
+
"Behind the Scenes of an Ant Genome Project" : "/publications/wurm2011antGenomeBehindTheScenes.pdf"
|
45
|
+
"The genome of the fire ant Solenopsis invicta" : "/publications/wurm2011fireAntGenome.pdf"
|
46
|
+
"Odorant Binding Proteins of the Red Imported Fire Ant, Solenopsis invicta: An Example of the Problems Facing the Analysis of Widely Divergent Proteins" : "/publications/gotzek2011obps.pdf"
|
47
|
+
"Parasitoid Wasps: From Natural History to Genomic Studies" : "/publications/wurm2010wasps.pdf"
|
48
|
+
"Changes in reproductive roles are associated with changes in gene expression in fire ant queens" : "/publications/wurm2010fireAntQueenDealationExpression.pdf"
|
49
|
+
"Fourmidable: a database for ant genomics" : "/publications/wurm2009antDatabase.pdf"
|
50
|
+
"Behavioral Genomics: A, Bee, C, G, T" : "/publications/wurm2007bees.pdf"
|
51
|
+
"An annotated cDNA library and microarray for large-scale gene-expression studies in the ant Solenopsis invicta" : "/publications/wang2007fireAntMicroarrays.pdf"
|
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rubyscholar-version'
|
4
|
+
|
5
|
+
|
6
|
+
class String
|
7
|
+
def clean
|
8
|
+
# removes leading and trailing whitespace, commas
|
9
|
+
self.gsub!(/(^[\s,]+)|([\s,]+$)/, '')
|
10
|
+
return self
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
module Rubyscholar
|
15
|
+
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
16
|
+
end
|
17
|
+
|
18
|
+
class Parser
|
19
|
+
attr_accessor :parsedPapers, :crossRefEmail
|
20
|
+
|
21
|
+
def initialize(url, crossRefEmail = "")
|
22
|
+
@parsedPapers = []
|
23
|
+
@crossRefEmail = crossRefEmail # if nil doesn't return any DOI
|
24
|
+
parse(url)
|
25
|
+
end
|
26
|
+
|
27
|
+
def parse(url)
|
28
|
+
STDERR << "Will check #{url}.\n"
|
29
|
+
page = Nokogiri::HTML(open(url,
|
30
|
+
'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'), nil, 'utf-8')
|
31
|
+
papers = page.css(".gsc_a_tr")
|
32
|
+
STDERR << "Found #{papers.length} papers.\n"
|
33
|
+
papers.each do |paper|
|
34
|
+
title = paper.css(".gsc_a_at").text rescue ''
|
35
|
+
title.gsub!(/\.$/, '')
|
36
|
+
|
37
|
+
googleUrl = paper.children[0].children[0].attribute('href').text rescue ''
|
38
|
+
authors = paper.children[0].children[1].text.clean rescue ''
|
39
|
+
authors.gsub!("...", "et al")
|
40
|
+
|
41
|
+
journal = paper.children[0].children[2].text rescue ''
|
42
|
+
journalName = journal.split(/,|\d/).first.clean rescue ''
|
43
|
+
journalDetails = journal.gsub(journalName, '').clean
|
44
|
+
year = journalDetails[/, \d+$/].to_s.dup # '' (mutable) when the entry has no trailing ", <year>", instead of crashing on nil
|
45
|
+
journalDetails = journalDetails.gsub(year, '').clean
|
46
|
+
year = year.clean
|
47
|
+
|
48
|
+
#citations
|
49
|
+
citeInfo = paper.css('.gsc_a_ac')
|
50
|
+
citationCount = citeInfo.text
|
51
|
+
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
52
|
+
|
53
|
+
# get DOI: needs last name of first author, no funny chars
|
54
|
+
lastNameFirstAuthor = ((authors.split(',').first ).split(' ').last ).gsub(/[^A-Za-z\-]/, '')
|
55
|
+
doi = getDoi( lastNameFirstAuthor, title, @crossRefEmail)
|
56
|
+
|
57
|
+
@parsedPapers.push(Paper.new( title, googleUrl, authors, journalName, journalDetails, year, citationCount, citationUrl, doi))
|
58
|
+
end
|
59
|
+
STDERR << "Scraped #{parsedPapers.length} from Google Scholar.\n"
|
60
|
+
end
|
61
|
+
|
62
|
+
# Scholar doesn't provide DOI.
|
63
|
+
# But if registered at CrossRef (it's free), the DOI can be retrieved.
|
64
|
+
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
|
65
|
+
return '' if @crossRefEmail.nil?
|
66
|
+
sleep(1) # to reduce risk
|
67
|
+
STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"
|
68
|
+
url = 'http://www.crossref.org/openurl?redirect=false' +
|
69
|
+
'&pid=' + crossRefEmail +
|
70
|
+
'&aulast=' + lastNameFirstAuthor +
|
71
|
+
'&atitle=' + URI.escape(title)
|
72
|
+
crossRefXML = Nokogiri::XML(open(url))
|
73
|
+
crossRefXML.search("doi").children.first.content rescue ''
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
class Formatter
|
78
|
+
attr_accessor :parser, :nameToHighlight, :pdfLinks, :altmetricDOIs
|
79
|
+
|
80
|
+
def initialize(parser, nameToHighlight = nil, pdfLinks = {}, altmetricDOIs = [], minCitationCount = 1)
|
81
|
+
@parser = parser
|
82
|
+
@nameToHighlight = nameToHighlight
|
83
|
+
@pdfLinks = pdfLinks
|
84
|
+
@altmetricDOIs = altmetricDOIs
|
85
|
+
@minCitations = minCitationCount
|
86
|
+
end
|
87
|
+
|
88
|
+
def to_html
|
89
|
+
builder = Nokogiri::HTML::Builder.new do |doc|
|
90
|
+
doc.div( :class => "publication") {
|
91
|
+
doc.ol {
|
92
|
+
@parser.parsedPapers.each_with_index do |paper, index|
|
93
|
+
doc.li( :value=> ( (@parser.parsedPapers).length - index).to_s) {
|
94
|
+
doc.b paper[:title] + '.'
|
95
|
+
doc.text ' (' + paper[:year] + '). '
|
96
|
+
if paper[:authors].include?(@nameToHighlight)
|
97
|
+
doc.text( paper[:authors].sub(Regexp.new(@nameToHighlight + '.*'), '') )
|
98
|
+
doc.span( :class => "label") { doc.text @nameToHighlight }
|
99
|
+
doc.text( paper[:authors].sub(Regexp.new('.*' + @nameToHighlight), '') )
|
100
|
+
else
|
101
|
+
doc.text( paper[:authors]) + '.'
|
102
|
+
end
|
103
|
+
|
104
|
+
doc.em ' ' + paper[:journalName]
|
105
|
+
doc.text ' ' + paper[:journalDetails]
|
106
|
+
unless paper[ :doi].empty?
|
107
|
+
doc.text(' ')
|
108
|
+
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
109
|
+
doc.text "[DOI]"
|
110
|
+
}
|
111
|
+
end
|
112
|
+
if @pdfLinks.keys.include?(paper[:title])
|
113
|
+
doc.text(' ')
|
114
|
+
doc.a( :href => @pdfLinks[paper[:title]]) {
|
115
|
+
doc.text "[PDF]"
|
116
|
+
}
|
117
|
+
end
|
118
|
+
if paper[ :citationCount].to_i > @minCitations
|
119
|
+
doc.text(' ')
|
120
|
+
doc.a( :href => paper[ :citingPapers], :title => "Citations") {
|
121
|
+
doc.span( :class => "badge badge-inverse") { doc.test("#{paper[ :citationCount]}x") }
|
122
|
+
}
|
123
|
+
end
|
124
|
+
if altmetricDOIs.include?( paper[ :doi])
|
125
|
+
doc.text(' ')
|
126
|
+
doc.span( :class => 'altmetric-embed',
|
127
|
+
:'data-badge-popover' => 'bottom',
|
128
|
+
:'data-doi' => paper[ :doi] )
|
129
|
+
end
|
130
|
+
}
|
131
|
+
end
|
132
|
+
}
|
133
|
+
}
|
134
|
+
end
|
135
|
+
return builder.to_html
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
@@ -1,32 +1,31 @@
|
|
1
|
-
require "rubyscholar/version"
|
2
1
|
require "nokogiri"
|
3
2
|
require "open-uri"
|
4
3
|
|
5
|
-
|
4
|
+
module Rubyscholar
|
6
5
|
class String
|
7
6
|
def clean
|
8
7
|
# removes leading and trailing whitespace, commas
|
9
8
|
self.gsub!(/(^[\s,]+)|([\s,]+$)/, '')
|
10
9
|
return self
|
11
10
|
end
|
12
|
-
|
11
|
+
end
|
13
12
|
|
14
|
-
module Rubyscholar
|
15
|
-
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
16
|
-
end
|
17
13
|
|
14
|
+
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
15
|
+
end
|
16
|
+
|
18
17
|
class Parser
|
19
18
|
attr_accessor :parsedPapers, :crossRefEmail
|
20
|
-
|
19
|
+
|
21
20
|
def initialize(url, crossRefEmail = "")
|
22
21
|
@parsedPapers = []
|
23
|
-
@crossRefEmail = crossRefEmail # if nil doesn't
|
22
|
+
@crossRefEmail = crossRefEmail # if nil doesn't return any DOI
|
24
23
|
parse(url)
|
25
24
|
end
|
26
25
|
|
27
26
|
def parse(url)
|
28
27
|
papers = Nokogiri::HTML(open(url)).css(".cit-table .item")
|
29
|
-
|
28
|
+
STDERR << "Found #{papers.length} papers.\n"
|
30
29
|
papers.each do |paper|
|
31
30
|
paperDetails = paper.css("#col-title")
|
32
31
|
title = paperDetails[0].children[0].content.clean
|
@@ -43,7 +42,7 @@ module Rubyscholar
|
|
43
42
|
#citations
|
44
43
|
citeInfo = paper.css(".cit-dark-link")
|
45
44
|
citationCount = citeInfo.text
|
46
|
-
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
45
|
+
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
47
46
|
|
48
47
|
# get DOI: needs last name of first author, no funny chars
|
49
48
|
lastNameFirstAuthor = ((authors.split(',').first ).split(' ').last ).gsub(/[^A-Za-z\-]/, '')
|
@@ -51,27 +50,27 @@ module Rubyscholar
|
|
51
50
|
|
52
51
|
@parsedPapers.push(Paper.new( title, googleUrl, authors, journalName, journalDetails, year, citationCount, citationUrl, doi))
|
53
52
|
end
|
54
|
-
|
53
|
+
STDERR << "Scraped #{parsedPapers.length} from Google Scholar.\n"
|
55
54
|
end
|
56
55
|
|
57
|
-
# Scholar doesn't provide DOI.
|
58
|
-
# But if registered at crossref (its free), DOI can be retreived.
|
56
|
+
# Scholar doesn't provide DOI.
|
57
|
+
# But if registered at crossref (its free), DOI can be retreived.
|
59
58
|
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
|
60
59
|
return '' if @crossRefEmail.nil?
|
61
|
-
sleep(1) # to reduce risk
|
60
|
+
sleep(1) # to reduce risk
|
62
61
|
STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"
|
63
|
-
url = 'http://www.crossref.org/openurl?redirect=false' +
|
64
|
-
'&pid=' + crossRefEmail +
|
62
|
+
url = 'http://www.crossref.org/openurl?redirect=false' +
|
63
|
+
'&pid=' + crossRefEmail +
|
65
64
|
'&aulast=' + lastNameFirstAuthor +
|
66
65
|
'&atitle=' + URI.escape(title)
|
67
|
-
crossRefXML = Nokogiri::XML(open(url))
|
66
|
+
crossRefXML = Nokogiri::XML(open(url))
|
68
67
|
crossRefXML.search("doi").children.first.content rescue ''
|
69
68
|
end
|
70
69
|
end
|
71
|
-
|
70
|
+
|
72
71
|
class Formatter
|
73
72
|
attr_accessor :parser, :nameToHighlight, :pdfLinks, :altmetricDOIs
|
74
|
-
|
73
|
+
|
75
74
|
def initialize(parser, nameToHighlight = nil, pdfLinks = {}, altmetricDOIs = [], minCitationCount = 1)
|
76
75
|
@parser = parser
|
77
76
|
@nameToHighlight = nameToHighlight
|
@@ -81,14 +80,14 @@ module Rubyscholar
|
|
81
80
|
end
|
82
81
|
|
83
82
|
def to_html
|
84
|
-
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
83
|
+
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
85
84
|
builder = Nokogiri::HTML::Builder.new do |doc|
|
86
85
|
doc.html {
|
87
86
|
doc.body {
|
88
87
|
@parser.parsedPapers.each_with_index { |paper, index|
|
89
88
|
doc.div( :class => "publication") {
|
90
89
|
doc.p {
|
91
|
-
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
90
|
+
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
92
91
|
|
93
92
|
doc.b paper[:title] + '.'
|
94
93
|
doc.text ' (' + paper[:year] + '). '
|
@@ -107,21 +106,21 @@ module Rubyscholar
|
|
107
106
|
doc.text paper[:journalDetails]
|
108
107
|
unless paper[ :doi].empty?
|
109
108
|
doc.text(' ')
|
110
|
-
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
111
|
-
doc.text "[DOI]"
|
112
|
-
}
|
109
|
+
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
110
|
+
doc.text "[DOI]"
|
111
|
+
}
|
113
112
|
end
|
114
113
|
if @pdfLinks.keys.include?(paper[:title])
|
115
114
|
doc.text(' ')
|
116
|
-
doc.a( :href => @pdfLinks[paper[:title]]) {
|
115
|
+
doc.a( :href => @pdfLinks[paper[:title]]) {
|
117
116
|
doc.text "[PDF]"
|
118
|
-
}
|
117
|
+
}
|
119
118
|
end
|
120
119
|
if paper[ :citationCount].to_i > @minCitations
|
121
120
|
doc.text(' ')
|
122
|
-
doc.a( :href => paper[ :citingPapers]) {
|
123
|
-
doc.text("[Cited #{paper[ :citationCount]}x]")
|
124
|
-
}
|
121
|
+
doc.a( :href => paper[ :citingPapers]) {
|
122
|
+
doc.text("[Cited #{paper[ :citationCount]}x]")
|
123
|
+
}
|
125
124
|
end
|
126
125
|
if altmetricDOIs.include?( paper[ :doi])
|
127
126
|
doc.text(' ')
|
data/rubyscholar.gemspec
CHANGED
@@ -1,23 +1,40 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'rubyscholar/version'
|
1
|
+
require './lib/rubyscholar-version.rb'
|
5
2
|
|
6
3
|
Gem::Specification.new do |gem|
|
7
|
-
gem.name =
|
4
|
+
gem.name = 'rubyscholar'
|
8
5
|
gem.version = Rubyscholar::VERSION
|
9
|
-
gem.authors = [
|
10
|
-
gem.email = [
|
11
|
-
gem.description = %q{Scrape Google Scholar}
|
12
|
-
gem.summary = %q{Rubyscholar scrapes google scholar and formats it into a scholar.html file.}
|
13
|
-
gem.homepage =
|
14
|
-
gem.license =
|
15
|
-
|
16
|
-
|
17
|
-
gem.
|
18
|
-
|
19
|
-
|
20
|
-
gem.
|
21
|
-
gem.
|
22
|
-
gem.
|
6
|
+
gem.authors = ['Yannick Wurm','Gaurav Koley']
|
7
|
+
gem.email = ['y.wurm@qmul.ac.uk','arkokoley@live.in']
|
8
|
+
gem.description = %q{Scrape Google Scholar Profile page}
|
9
|
+
gem.summary = %q{Rubyscholar scrapes one google scholar and formats it into a scholar.html file listing publications.}
|
10
|
+
gem.homepage = 'http://yannick.poulet.org/'
|
11
|
+
gem.license = 'MIT'
|
12
|
+
|
13
|
+
|
14
|
+
gem.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.0'
|
15
|
+
gem.add_runtime_dependency 'commander', '~> 4.1', '>= 4.1.5'
|
16
|
+
|
17
|
+
gem.files = Dir['lib/**/*'] + Dir['views/**/*'] + Dir['public/**/*'] + Dir['tests/**/*']
|
18
|
+
gem.files = gem.files + ['example.config.yml']
|
19
|
+
gem.files = gem.files + ['LICENSE.txt', 'README.md']
|
20
|
+
gem.files = gem.files + ['Gemfile', 'rubyscholar.gemspec']
|
21
|
+
|
22
|
+
gem.executables = ['rubyscholar']
|
23
|
+
gem.require_paths = ['lib']
|
24
|
+
|
25
|
+
gem.post_install_message = <<INFO
|
26
|
+
|
27
|
+
-----
|
28
|
+
Thanks for installing rubyscholar.
|
29
|
+
If something isn't working, this may be due to Google's changing the format of Scholar pages
|
30
|
+
(they do this regularly with no warning).
|
31
|
+
|
32
|
+
If you can have a shot at fixing it, please go ahead - pull requests are most welcome.
|
33
|
+
|
34
|
+
All the best,
|
35
|
+
|
36
|
+
Yannick - http://yannick.poulet.org
|
37
|
+
----
|
38
|
+
|
39
|
+
INFO
|
23
40
|
end
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyscholar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: '0.2'
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Yannick Wurm
|
@@ -10,84 +9,93 @@ authors:
|
|
10
9
|
autorequire:
|
11
10
|
bindir: bin
|
12
11
|
cert_chain: []
|
13
|
-
date:
|
12
|
+
date: 2014-09-20 00:00:00.000000000 Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
15
|
name: nokogiri
|
17
16
|
requirement: !ruby/object:Gem::Requirement
|
18
|
-
none: false
|
19
17
|
requirements:
|
20
|
-
- - ~>
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.6'
|
21
|
+
- - ">="
|
21
22
|
- !ruby/object:Gem::Version
|
22
23
|
version: 1.6.0
|
23
24
|
type: :runtime
|
24
25
|
prerelease: false
|
25
26
|
version_requirements: !ruby/object:Gem::Requirement
|
26
|
-
none: false
|
27
27
|
requirements:
|
28
|
-
- - ~>
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.6'
|
31
|
+
- - ">="
|
29
32
|
- !ruby/object:Gem::Version
|
30
33
|
version: 1.6.0
|
31
34
|
- !ruby/object:Gem::Dependency
|
32
35
|
name: commander
|
33
36
|
requirement: !ruby/object:Gem::Requirement
|
34
|
-
none: false
|
35
37
|
requirements:
|
36
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '4.1'
|
41
|
+
- - ">="
|
37
42
|
- !ruby/object:Gem::Version
|
38
43
|
version: 4.1.5
|
39
44
|
type: :runtime
|
40
45
|
prerelease: false
|
41
46
|
version_requirements: !ruby/object:Gem::Requirement
|
42
|
-
none: false
|
43
47
|
requirements:
|
44
|
-
- - ~>
|
48
|
+
- - "~>"
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '4.1'
|
51
|
+
- - ">="
|
45
52
|
- !ruby/object:Gem::Version
|
46
53
|
version: 4.1.5
|
47
|
-
description: Scrape Google Scholar
|
54
|
+
description: Scrape Google Scholar Profile page
|
48
55
|
email:
|
49
56
|
- y.wurm@qmul.ac.uk
|
50
57
|
- arkokoley@live.in
|
51
58
|
executables:
|
52
59
|
- rubyscholar
|
53
|
-
- scrape.rb
|
54
60
|
extensions: []
|
55
61
|
extra_rdoc_files: []
|
56
62
|
files:
|
57
|
-
- .gitignore
|
58
|
-
- CHANGELOG.md
|
59
63
|
- Gemfile
|
60
64
|
- LICENSE.txt
|
61
65
|
- README.md
|
62
|
-
- Rakefile
|
63
66
|
- bin/rubyscholar
|
64
|
-
-
|
65
|
-
- lib/rubyscholar.rb
|
66
|
-
- lib/rubyscholar
|
67
|
+
- example.config.yml
|
68
|
+
- lib/rubyscholar-main.rb
|
69
|
+
- lib/rubyscholar-version.rb
|
70
|
+
- lib/rubyscholar-version.rb~
|
71
|
+
- lib/rubyscholar.rb~
|
67
72
|
- rubyscholar.gemspec
|
68
|
-
homepage: http://
|
73
|
+
homepage: http://yannick.poulet.org/
|
69
74
|
licenses:
|
70
75
|
- MIT
|
71
|
-
|
76
|
+
metadata: {}
|
77
|
+
post_install_message: "\n-----\nThanks for installing rubyscholar. \nIf something
|
78
|
+
isn't working, this may be due to Google's changing the format of Scholar pages\n(they
|
79
|
+
do this regularly with no warning). \n\nIf you can have a shot at fixing it, please
|
80
|
+
go ahead - pull requests are most welcome.\n\nAll the best, \n\nYannick - http://yannick.poulet.org\n----\n\n"
|
72
81
|
rdoc_options: []
|
73
82
|
require_paths:
|
74
83
|
- lib
|
75
84
|
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
-
none: false
|
77
85
|
requirements:
|
78
|
-
- -
|
86
|
+
- - ">="
|
79
87
|
- !ruby/object:Gem::Version
|
80
88
|
version: '0'
|
81
89
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
-
none: false
|
83
90
|
requirements:
|
84
|
-
- -
|
91
|
+
- - ">="
|
85
92
|
- !ruby/object:Gem::Version
|
86
93
|
version: '0'
|
87
94
|
requirements: []
|
88
95
|
rubyforge_project:
|
89
|
-
rubygems_version:
|
96
|
+
rubygems_version: 2.2.2
|
90
97
|
signing_key:
|
91
|
-
specification_version:
|
92
|
-
summary: Rubyscholar scrapes google scholar and formats it into a scholar.html
|
98
|
+
specification_version: 4
|
99
|
+
summary: Rubyscholar scrapes one google scholar and formats it into a scholar.html
|
100
|
+
file listing publications.
|
93
101
|
test_files: []
|
data/.gitignore
DELETED
data/CHANGELOG.md
DELETED
data/Rakefile
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
require "bundler/gem_tasks"
|
data/bin/scrape.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
require_relative '../lib/rubyscholar'
|
2
|
-
require 'yaml'
|
3
|
-
|
4
|
-
config = YAML.load_file('config.yml')
|
5
|
-
parsed = Rubyscholar::Parser.new(config["url"],
|
6
|
-
config["email"])
|
7
|
-
formatter = Rubyscholar::Formatter.new(parsed,
|
8
|
-
config["highlight"],
|
9
|
-
config["pdfs"],
|
10
|
-
config["altmetricDOIs"],
|
11
|
-
config["minCitations"].to_i)
|
12
|
-
|
13
|
-
html = formatter.to_html
|
14
|
-
config["italicize"].each do |term|
|
15
|
-
html.gsub!( term , '<em>' + term + '</em>')
|
16
|
-
end
|
17
|
-
|
18
|
-
f= File.open('scholar.html','w')
|
19
|
-
f.write html
|
20
|
-
f.close
|