rubyscholar 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +14 -4
- data/Rakefile +1 -0
- data/bin/rubyscholar +47 -0
- data/bin/scrape.rb +2 -4
- data/lib/rubyscholar/version.rb +3 -0
- data/lib/rubyscholar.rb +33 -33
- data/rubyscholar.gemspec +22 -0
- metadata +18 -11
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Yannick Wurm
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# Rubyscholar
|
2
|
+
|
1
3
|
# Synopsis
|
2
4
|
|
3
5
|
Here is a small script to "scrape" your Google Scholar citations and reformat them (the way I need it for my website).
|
@@ -11,12 +13,13 @@ Some features:
|
|
11
13
|
|
12
14
|
# How to use:
|
13
15
|
|
16
|
+
### As a ruby script:
|
14
17
|
1. Configure "config.yml"
|
15
18
|
If you want DOI retrieval to work (including Altmetrics), you need to be
|
16
19
|
registered at crossref (it's free).
|
17
|
-
2. Run `ruby bin/scrape.rb
|
18
|
-
3.
|
19
|
-
|
20
|
+
2. Run `ruby bin/scrape.rb`
|
21
|
+
3. A scholar.html file is created with your publications from google scholar.
|
22
|
+
4. That's it.
|
20
23
|
|
21
24
|
# Potential for improvement:
|
22
25
|
|
@@ -36,5 +39,12 @@ RubyScholar was developed by Yannick Wurm (http://yannick.poulet.org). Pull requ
|
|
36
39
|
|
37
40
|
# Copyright
|
38
41
|
|
39
|
-
RubyScholar
|
42
|
+
RubyScholar © 2013 by Yannick Wurm. Licensed under the MIT license.
|
43
|
+
|
44
|
+
## Contributing
|
40
45
|
|
46
|
+
1. Fork it
|
47
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
48
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
49
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
50
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/rubyscholar
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'commander/import'
|
6
|
+
require 'rubyscholar'
|
7
|
+
require 'yaml'
|
8
|
+
|
9
|
+
|
10
|
+
program :name, 'rubysholar'
|
11
|
+
program :version, '0.0.2'
|
12
|
+
program :description, 'Rubyscholar scrapes google scholar and formats it into a scholar.html file.'
|
13
|
+
|
14
|
+
default_command :scrape
|
15
|
+
|
16
|
+
command :scrape do |c|
|
17
|
+
c.syntax = 'rubyscholar scrape [options]'
|
18
|
+
c.summary = ''
|
19
|
+
c.description = "Scape google scholar for new publications"
|
20
|
+
|
21
|
+
c.option '--config [Config File]', 'Config file to use'
|
22
|
+
c.option '--out [Output File]', 'File to output the scrapes to'
|
23
|
+
|
24
|
+
c.action do |args, options|
|
25
|
+
options.default \
|
26
|
+
:config => 'config.yml',
|
27
|
+
:out => 'scholar.html'
|
28
|
+
config = YAML.load_file('config.yml')
|
29
|
+
parsed = Rubyscholar::Parser.new(config["url"],
|
30
|
+
config["email"])
|
31
|
+
formatter = Rubyscholar::Formatter.new(parsed,
|
32
|
+
config["highlight"],
|
33
|
+
config["pdfs"],
|
34
|
+
config["altmetricDOIs"],
|
35
|
+
config["minCitations"].to_i)
|
36
|
+
|
37
|
+
html = formatter.to_html
|
38
|
+
config["italicize"].each do |term|
|
39
|
+
html.gsub!( term , '<em>' + term + '</em>')
|
40
|
+
end
|
41
|
+
|
42
|
+
f= File.open('scholar.html','w')
|
43
|
+
f.write html
|
44
|
+
f.close()
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
data/bin/scrape.rb
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
require_relative '../lib/rubyscholar'
|
2
2
|
require 'yaml'
|
3
3
|
|
4
|
-
def scrape()
|
5
4
|
config = YAML.load_file('config.yml')
|
6
|
-
parsed =
|
5
|
+
parsed = Rubyscholar::Parser.new(config["url"],
|
7
6
|
config["email"])
|
8
|
-
formatter =
|
7
|
+
formatter = Rubyscholar::Formatter.new(parsed,
|
9
8
|
config["highlight"],
|
10
9
|
config["pdfs"],
|
11
10
|
config["altmetricDOIs"],
|
@@ -19,4 +18,3 @@ def scrape()
|
|
19
18
|
f= File.open('scholar.html','w')
|
20
19
|
f.write html
|
21
20
|
f.close
|
22
|
-
end
|
data/lib/rubyscholar.rb
CHANGED
@@ -1,24 +1,26 @@
|
|
1
|
+
require "rubyscholar/version"
|
1
2
|
require "nokogiri"
|
2
3
|
require "open-uri"
|
3
4
|
|
4
|
-
|
5
|
+
|
6
|
+
class String
|
5
7
|
def clean
|
6
8
|
# removes leading and trailing whitespace, commas
|
7
9
|
self.gsub!(/(^[\s,]+)|([\s,]+$)/, '')
|
8
10
|
return self
|
9
11
|
end
|
10
|
-
end
|
12
|
+
end
|
11
13
|
|
12
|
-
module
|
14
|
+
module Rubyscholar
|
13
15
|
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
14
|
-
end
|
15
|
-
|
16
|
+
end
|
17
|
+
|
16
18
|
class Parser
|
17
19
|
attr_accessor :parsedPapers, :crossRefEmail
|
18
|
-
|
20
|
+
|
19
21
|
def initialize(url, crossRefEmail = "")
|
20
22
|
@parsedPapers = []
|
21
|
-
@crossRefEmail = crossRefEmail # if nil doesn't
|
23
|
+
@crossRefEmail = crossRefEmail # if nil doesn't retursn any DOI
|
22
24
|
parse(url)
|
23
25
|
end
|
24
26
|
|
@@ -41,7 +43,7 @@ module RubyScholar
|
|
41
43
|
#citations
|
42
44
|
citeInfo = paper.css(".cit-dark-link")
|
43
45
|
citationCount = citeInfo.text
|
44
|
-
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
46
|
+
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
45
47
|
|
46
48
|
# get DOI: needs last name of first author, no funny chars
|
47
49
|
lastNameFirstAuthor = ((authors.split(',').first ).split(' ').last ).gsub(/[^A-Za-z\-]/, '')
|
@@ -52,24 +54,24 @@ module RubyScholar
|
|
52
54
|
STDOUT << "Scraped #{parsedPapers.length} from Google Scholar.\n"
|
53
55
|
end
|
54
56
|
|
55
|
-
# Scholar doesn't provide DOI.
|
56
|
-
# But if registered at crossref (its free), DOI can be retreived.
|
57
|
+
# Scholar doesn't provide DOI.
|
58
|
+
# But if registered at crossref (it's free), DOI can be retrieved.
|
57
59
|
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
|
58
60
|
return '' if @crossRefEmail.nil?
|
59
|
-
sleep(1) # to reduce risk
|
61
|
+
sleep(1) # to reduce risk
|
60
62
|
STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"
|
61
|
-
url = 'http://www.crossref.org/openurl?redirect=false' +
|
62
|
-
'&pid=' + crossRefEmail +
|
63
|
+
url = 'http://www.crossref.org/openurl?redirect=false' +
|
64
|
+
'&pid=' + crossRefEmail +
|
63
65
|
'&aulast=' + lastNameFirstAuthor +
|
64
66
|
'&atitle=' + URI.escape(title)
|
65
|
-
crossRefXML = Nokogiri::XML(open(url))
|
67
|
+
crossRefXML = Nokogiri::XML(open(url))
|
66
68
|
crossRefXML.search("doi").children.first.content rescue ''
|
67
69
|
end
|
68
70
|
end
|
69
|
-
|
71
|
+
|
70
72
|
class Formatter
|
71
73
|
attr_accessor :parser, :nameToHighlight, :pdfLinks, :altmetricDOIs
|
72
|
-
|
74
|
+
|
73
75
|
def initialize(parser, nameToHighlight = nil, pdfLinks = {}, altmetricDOIs = [], minCitationCount = 1)
|
74
76
|
@parser = parser
|
75
77
|
@nameToHighlight = nameToHighlight
|
@@ -79,47 +81,47 @@ module RubyScholar
|
|
79
81
|
end
|
80
82
|
|
81
83
|
def to_html
|
82
|
-
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
84
|
+
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
83
85
|
builder = Nokogiri::HTML::Builder.new do |doc|
|
84
86
|
doc.html {
|
85
87
|
doc.body {
|
86
88
|
@parser.parsedPapers.each_with_index { |paper, index|
|
87
89
|
doc.div( :class => "publication") {
|
88
90
|
doc.p {
|
89
|
-
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
91
|
+
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
92
|
+
|
93
|
+
doc.b paper[:title] + '.'
|
94
|
+
doc.text ' (' + paper[:year] + '). '
|
90
95
|
|
91
96
|
if paper[:authors].include?(@nameToHighlight)
|
92
97
|
doc.text( paper[:authors].sub(Regexp.new(@nameToHighlight + '.*'), '') )
|
93
|
-
doc.span( :class => "
|
98
|
+
doc.span( :class => "label label-info") { doc.text @nameToHighlight }
|
94
99
|
doc.text( paper[:authors].sub(Regexp.new('.*' + @nameToHighlight), '') )
|
95
100
|
else
|
96
101
|
doc.text( paper[:authors])
|
97
102
|
end
|
98
|
-
|
99
|
-
doc.text ' ' + paper[:year] + '. '
|
100
|
-
doc.b paper[:title] + '.'
|
103
|
+
|
101
104
|
doc.br
|
102
105
|
doc.em paper[:journalName]
|
103
106
|
doc.text ' '
|
104
107
|
doc.text paper[:journalDetails]
|
105
|
-
|
106
108
|
unless paper[ :doi].empty?
|
107
109
|
doc.text(' ')
|
108
|
-
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
109
|
-
doc.text "[DOI]"
|
110
|
-
}
|
110
|
+
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
111
|
+
doc.text "[DOI]"
|
112
|
+
}
|
111
113
|
end
|
112
114
|
if @pdfLinks.keys.include?(paper[:title])
|
113
115
|
doc.text(' ')
|
114
|
-
doc.a( :href => @pdfLinks[paper[:title]]) {
|
116
|
+
doc.a( :href => @pdfLinks[paper[:title]]) {
|
115
117
|
doc.text "[PDF]"
|
116
|
-
}
|
118
|
+
}
|
117
119
|
end
|
118
120
|
if paper[ :citationCount].to_i > @minCitations
|
119
121
|
doc.text(' ')
|
120
|
-
doc.a( :href => paper[ :citingPapers]) {
|
121
|
-
doc.text("[Cited #{paper[ :citationCount]}x]")
|
122
|
-
}
|
122
|
+
doc.a( :href => paper[ :citingPapers]) {
|
123
|
+
doc.text("[Cited #{paper[ :citationCount]}x]")
|
124
|
+
}
|
123
125
|
end
|
124
126
|
if altmetricDOIs.include?( paper[ :doi])
|
125
127
|
doc.text(' ')
|
@@ -137,5 +139,3 @@ module RubyScholar
|
|
137
139
|
end
|
138
140
|
end
|
139
141
|
end
|
140
|
-
|
141
|
-
|
data/rubyscholar.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'rubyscholar/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "rubyscholar"
|
8
|
+
gem.version = Rubyscholar::VERSION
|
9
|
+
gem.authors = ["Yannick Wurm","Gaurav Koley"]
|
10
|
+
gem.email = ["y.wurm@qmul.ac.uk","arkokoley@live.in"]
|
11
|
+
gem.description = %q{Scrape Google Scholar}
|
12
|
+
gem.summary = %q{Rubyscholar scrapes google scholar and formats it into a scholar.html file.}
|
13
|
+
gem.homepage = ""
|
14
|
+
|
15
|
+
gem.add_dependency "nokogiri", "~>1.6.0"
|
16
|
+
gem.add_dependency "commander", "~>4.1.5"
|
17
|
+
|
18
|
+
gem.files = `git ls-files`.split($/)
|
19
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
20
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
21
|
+
gem.require_paths = ["lib"]
|
22
|
+
end
|
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyscholar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Yannick Wurm
|
9
|
+
- Gaurav Koley
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date: 2013-
|
13
|
+
date: 2013-10-01 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: nokogiri
|
@@ -28,38 +29,44 @@ dependencies:
|
|
28
29
|
- !ruby/object:Gem::Version
|
29
30
|
version: 1.6.0
|
30
31
|
- !ruby/object:Gem::Dependency
|
31
|
-
name:
|
32
|
+
name: commander
|
32
33
|
requirement: !ruby/object:Gem::Requirement
|
33
34
|
none: false
|
34
35
|
requirements:
|
35
36
|
- - ~>
|
36
37
|
- !ruby/object:Gem::Version
|
37
|
-
version:
|
38
|
-
type: :
|
38
|
+
version: 4.1.5
|
39
|
+
type: :runtime
|
39
40
|
prerelease: false
|
40
41
|
version_requirements: !ruby/object:Gem::Requirement
|
41
42
|
none: false
|
42
43
|
requirements:
|
43
44
|
- - ~>
|
44
45
|
- !ruby/object:Gem::Version
|
45
|
-
version:
|
46
|
-
description:
|
47
|
-
them. It doesn't do a whole lot, but it's still useful.
|
46
|
+
version: 4.1.5
|
47
|
+
description: Scrape Google Scholar
|
48
48
|
email:
|
49
49
|
- y.wurm@qmul.ac.uk
|
50
|
+
- arkokoley@live.in
|
50
51
|
executables:
|
52
|
+
- rubyscholar
|
51
53
|
- scrape.rb
|
52
54
|
extensions: []
|
53
55
|
extra_rdoc_files: []
|
54
56
|
files:
|
55
57
|
- .gitignore
|
58
|
+
- Gemfile
|
59
|
+
- LICENSE.txt
|
56
60
|
- README.md
|
61
|
+
- Rakefile
|
62
|
+
- bin/rubyscholar
|
57
63
|
- bin/scrape.rb
|
58
64
|
- config.yml
|
59
65
|
- lib/rubyscholar.rb
|
66
|
+
- lib/rubyscholar/version.rb
|
67
|
+
- rubyscholar.gemspec
|
60
68
|
homepage: ''
|
61
|
-
licenses:
|
62
|
-
- MIT
|
69
|
+
licenses: []
|
63
70
|
post_install_message:
|
64
71
|
rdoc_options: []
|
65
72
|
require_paths:
|
@@ -81,5 +88,5 @@ rubyforge_project:
|
|
81
88
|
rubygems_version: 1.8.23
|
82
89
|
signing_key:
|
83
90
|
specification_version: 3
|
84
|
-
summary:
|
91
|
+
summary: Rubyscholar scrapes google scholar and formats it into a scholar.html file.
|
85
92
|
test_files: []
|