rubyscholar 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +14 -4
- data/Rakefile +1 -0
- data/bin/rubyscholar +47 -0
- data/bin/scrape.rb +2 -4
- data/lib/rubyscholar/version.rb +3 -0
- data/lib/rubyscholar.rb +33 -33
- data/rubyscholar.gemspec +22 -0
- metadata +18 -11
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Yannick Wurm
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# Rubyscholar
|
2
|
+
|
1
3
|
# Synopsis
|
2
4
|
|
3
5
|
Here is a small script to "scrape" your Google Scholar citations and reformat them (the way I need it for my website).
|
@@ -11,12 +13,13 @@ Some features:
|
|
11
13
|
|
12
14
|
# How to use:
|
13
15
|
|
16
|
+
### As a ruby script:
|
14
17
|
1. Configure "config.yml"
|
15
18
|
If you want DOI retrieval to work (including Altmetrics), you need to be
|
16
19
|
registered at crossref (it's free).
|
17
|
-
2. Run `ruby bin/scrape.rb
|
18
|
-
3.
|
19
|
-
|
20
|
+
2. Run `ruby bin/scrape.rb`
|
21
|
+
3. A scholar.html file is created with your publications from google scholar.
|
22
|
+
4. That's it.
|
20
23
|
|
21
24
|
# Potential for improvement:
|
22
25
|
|
@@ -36,5 +39,12 @@ RubyScholar was developed by Yannick Wurm (http://yannick.poulet.org). Pull requ
|
|
36
39
|
|
37
40
|
# Copyright
|
38
41
|
|
39
|
-
RubyScholar
|
42
|
+
RubyScholar © 2013 by Yannick Wurm. Licensed under the MIT license.
|
43
|
+
|
44
|
+
## Contributing
|
40
45
|
|
46
|
+
1. Fork it
|
47
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
48
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
49
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
50
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/rubyscholar
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'commander/import'
|
6
|
+
require 'rubyscholar'
|
7
|
+
require 'yaml'
|
8
|
+
|
9
|
+
|
10
|
+
program :name, 'rubysholar'
|
11
|
+
program :version, '0.0.2'
|
12
|
+
program :description, 'Rubyscholar scrapes google scholar and formats it into a scholar.html file.'
|
13
|
+
|
14
|
+
default_command :scrape
|
15
|
+
|
16
|
+
command :scrape do |c|
|
17
|
+
c.syntax = 'rubyscholar scrape [options]'
|
18
|
+
c.summary = ''
|
19
|
+
c.description = "Scape google scholar for new publications"
|
20
|
+
|
21
|
+
c.option '--config [Config File]', 'Config file to use'
|
22
|
+
c.option '--out [Output File]', 'File to output the scrapes to'
|
23
|
+
|
24
|
+
c.action do |args, options|
|
25
|
+
options.default \
|
26
|
+
:config => 'config.yml',
|
27
|
+
:out => 'scholar.html'
|
28
|
+
config = YAML.load_file('config.yml')
|
29
|
+
parsed = Rubyscholar::Parser.new(config["url"],
|
30
|
+
config["email"])
|
31
|
+
formatter = Rubyscholar::Formatter.new(parsed,
|
32
|
+
config["highlight"],
|
33
|
+
config["pdfs"],
|
34
|
+
config["altmetricDOIs"],
|
35
|
+
config["minCitations"].to_i)
|
36
|
+
|
37
|
+
html = formatter.to_html
|
38
|
+
config["italicize"].each do |term|
|
39
|
+
html.gsub!( term , '<em>' + term + '</em>')
|
40
|
+
end
|
41
|
+
|
42
|
+
f= File.open('scholar.html','w')
|
43
|
+
f.write html
|
44
|
+
f.close()
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
data/bin/scrape.rb
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
require_relative '../lib/rubyscholar'
|
2
2
|
require 'yaml'
|
3
3
|
|
4
|
-
def scrape()
|
5
4
|
config = YAML.load_file('config.yml')
|
6
|
-
parsed =
|
5
|
+
parsed = Rubyscholar::Parser.new(config["url"],
|
7
6
|
config["email"])
|
8
|
-
formatter =
|
7
|
+
formatter = Rubyscholar::Formatter.new(parsed,
|
9
8
|
config["highlight"],
|
10
9
|
config["pdfs"],
|
11
10
|
config["altmetricDOIs"],
|
@@ -19,4 +18,3 @@ def scrape()
|
|
19
18
|
f= File.open('scholar.html','w')
|
20
19
|
f.write html
|
21
20
|
f.close
|
22
|
-
end
|
data/lib/rubyscholar.rb
CHANGED
@@ -1,24 +1,26 @@
|
|
1
|
+
require "rubyscholar/version"
|
1
2
|
require "nokogiri"
|
2
3
|
require "open-uri"
|
3
4
|
|
4
|
-
|
5
|
+
|
6
|
+
class String
|
5
7
|
def clean
|
6
8
|
# removes leading and trailing whitespace, commas
|
7
9
|
self.gsub!(/(^[\s,]+)|([\s,]+$)/, '')
|
8
10
|
return self
|
9
11
|
end
|
10
|
-
end
|
12
|
+
end
|
11
13
|
|
12
|
-
module
|
14
|
+
module Rubyscholar
|
13
15
|
class Paper < Struct.new(:title, :url, :authors, :journalName, :journalDetails, :year, :citationCount, :citingPapers, :doi)
|
14
|
-
end
|
15
|
-
|
16
|
+
end
|
17
|
+
|
16
18
|
class Parser
|
17
19
|
attr_accessor :parsedPapers, :crossRefEmail
|
18
|
-
|
20
|
+
|
19
21
|
def initialize(url, crossRefEmail = "")
|
20
22
|
@parsedPapers = []
|
21
|
-
@crossRefEmail = crossRefEmail # if nil doesn't
|
23
|
+
@crossRefEmail = crossRefEmail # if nil doesn't return any DOI
|
22
24
|
parse(url)
|
23
25
|
end
|
24
26
|
|
@@ -41,7 +43,7 @@ module RubyScholar
|
|
41
43
|
#citations
|
42
44
|
citeInfo = paper.css(".cit-dark-link")
|
43
45
|
citationCount = citeInfo.text
|
44
|
-
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
46
|
+
citationUrl = citationCount.empty? ? nil : citeInfo.attribute('href').to_s
|
45
47
|
|
46
48
|
# get DOI: needs last name of first author, no funny chars
|
47
49
|
lastNameFirstAuthor = ((authors.split(',').first ).split(' ').last ).gsub(/[^A-Za-z\-]/, '')
|
@@ -52,24 +54,24 @@ module RubyScholar
|
|
52
54
|
STDOUT << "Scraped #{parsedPapers.length} from Google Scholar.\n"
|
53
55
|
end
|
54
56
|
|
55
|
-
# Scholar doesn't provide DOI.
|
56
|
-
# But if registered at crossref (its free), DOI can be retreived.
|
57
|
+
# Scholar doesn't provide DOI.
|
58
|
+
# But if registered at crossref (it's free), DOI can be retrieved.
|
57
59
|
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
|
58
60
|
return '' if @crossRefEmail.nil?
|
59
|
-
sleep(1) # to reduce risk
|
61
|
+
sleep(1) # to reduce risk
|
60
62
|
STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"
|
61
|
-
url = 'http://www.crossref.org/openurl?redirect=false' +
|
62
|
-
'&pid=' + crossRefEmail +
|
63
|
+
url = 'http://www.crossref.org/openurl?redirect=false' +
|
64
|
+
'&pid=' + crossRefEmail +
|
63
65
|
'&aulast=' + lastNameFirstAuthor +
|
64
66
|
'&atitle=' + URI.escape(title)
|
65
|
-
crossRefXML = Nokogiri::XML(open(url))
|
67
|
+
crossRefXML = Nokogiri::XML(open(url))
|
66
68
|
crossRefXML.search("doi").children.first.content rescue ''
|
67
69
|
end
|
68
70
|
end
|
69
|
-
|
71
|
+
|
70
72
|
class Formatter
|
71
73
|
attr_accessor :parser, :nameToHighlight, :pdfLinks, :altmetricDOIs
|
72
|
-
|
74
|
+
|
73
75
|
def initialize(parser, nameToHighlight = nil, pdfLinks = {}, altmetricDOIs = [], minCitationCount = 1)
|
74
76
|
@parser = parser
|
75
77
|
@nameToHighlight = nameToHighlight
|
@@ -79,47 +81,47 @@ module RubyScholar
|
|
79
81
|
end
|
80
82
|
|
81
83
|
def to_html
|
82
|
-
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
84
|
+
##@doc = Nokogiri::HTML::DocumentFragment.parse ""
|
83
85
|
builder = Nokogiri::HTML::Builder.new do |doc|
|
84
86
|
doc.html {
|
85
87
|
doc.body {
|
86
88
|
@parser.parsedPapers.each_with_index { |paper, index|
|
87
89
|
doc.div( :class => "publication") {
|
88
90
|
doc.p {
|
89
|
-
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
91
|
+
doc.text ((@parser.parsedPapers).length - index).to_s + '. '
|
92
|
+
|
93
|
+
doc.b paper[:title] + '.'
|
94
|
+
doc.text ' (' + paper[:year] + '). '
|
90
95
|
|
91
96
|
if paper[:authors].include?(@nameToHighlight)
|
92
97
|
doc.text( paper[:authors].sub(Regexp.new(@nameToHighlight + '.*'), '') )
|
93
|
-
doc.span( :class => "
|
98
|
+
doc.span( :class => "label label-info") { doc.text @nameToHighlight }
|
94
99
|
doc.text( paper[:authors].sub(Regexp.new('.*' + @nameToHighlight), '') )
|
95
100
|
else
|
96
101
|
doc.text( paper[:authors])
|
97
102
|
end
|
98
|
-
|
99
|
-
doc.text ' ' + paper[:year] + '. '
|
100
|
-
doc.b paper[:title] + '.'
|
103
|
+
|
101
104
|
doc.br
|
102
105
|
doc.em paper[:journalName]
|
103
106
|
doc.text ' '
|
104
107
|
doc.text paper[:journalDetails]
|
105
|
-
|
106
108
|
unless paper[ :doi].empty?
|
107
109
|
doc.text(' ')
|
108
|
-
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
109
|
-
doc.text "[DOI]"
|
110
|
-
}
|
110
|
+
doc.a( :href => URI.join("http://dx.doi.org/", paper[ :doi])) {
|
111
|
+
doc.text "[DOI]"
|
112
|
+
}
|
111
113
|
end
|
112
114
|
if @pdfLinks.keys.include?(paper[:title])
|
113
115
|
doc.text(' ')
|
114
|
-
doc.a( :href => @pdfLinks[paper[:title]]) {
|
116
|
+
doc.a( :href => @pdfLinks[paper[:title]]) {
|
115
117
|
doc.text "[PDF]"
|
116
|
-
}
|
118
|
+
}
|
117
119
|
end
|
118
120
|
if paper[ :citationCount].to_i > @minCitations
|
119
121
|
doc.text(' ')
|
120
|
-
doc.a( :href => paper[ :citingPapers]) {
|
121
|
-
doc.text("[Cited #{paper[ :citationCount]}x]")
|
122
|
-
}
|
122
|
+
doc.a( :href => paper[ :citingPapers]) {
|
123
|
+
doc.text("[Cited #{paper[ :citationCount]}x]")
|
124
|
+
}
|
123
125
|
end
|
124
126
|
if altmetricDOIs.include?( paper[ :doi])
|
125
127
|
doc.text(' ')
|
@@ -137,5 +139,3 @@ module RubyScholar
|
|
137
139
|
end
|
138
140
|
end
|
139
141
|
end
|
140
|
-
|
141
|
-
|
data/rubyscholar.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'rubyscholar/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "rubyscholar"
|
8
|
+
gem.version = Rubyscholar::VERSION
|
9
|
+
gem.authors = ["Yannick Wurm","Gaurav Koley"]
|
10
|
+
gem.email = ["y.wurm@qmul.ac.uk","arkokoley@live.in"]
|
11
|
+
gem.description = %q{Scrape Google Scholar}
|
12
|
+
gem.summary = %q{Rubyscholar scrapes google scholar and formats it into a scholar.html file.}
|
13
|
+
gem.homepage = ""
|
14
|
+
|
15
|
+
gem.add_dependency "nokogiri", "~>1.6.0"
|
16
|
+
gem.add_dependency "commander", "~>4.1.5"
|
17
|
+
|
18
|
+
gem.files = `git ls-files`.split($/)
|
19
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
20
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
21
|
+
gem.require_paths = ["lib"]
|
22
|
+
end
|
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyscholar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Yannick Wurm
|
9
|
+
- Gaurav Koley
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date: 2013-
|
13
|
+
date: 2013-10-01 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: nokogiri
|
@@ -28,38 +29,44 @@ dependencies:
|
|
28
29
|
- !ruby/object:Gem::Version
|
29
30
|
version: 1.6.0
|
30
31
|
- !ruby/object:Gem::Dependency
|
31
|
-
name:
|
32
|
+
name: commander
|
32
33
|
requirement: !ruby/object:Gem::Requirement
|
33
34
|
none: false
|
34
35
|
requirements:
|
35
36
|
- - ~>
|
36
37
|
- !ruby/object:Gem::Version
|
37
|
-
version:
|
38
|
-
type: :
|
38
|
+
version: 4.1.5
|
39
|
+
type: :runtime
|
39
40
|
prerelease: false
|
40
41
|
version_requirements: !ruby/object:Gem::Requirement
|
41
42
|
none: false
|
42
43
|
requirements:
|
43
44
|
- - ~>
|
44
45
|
- !ruby/object:Gem::Version
|
45
|
-
version:
|
46
|
-
description:
|
47
|
-
them. It doesn't do a whole lot, but it's still useful.
|
46
|
+
version: 4.1.5
|
47
|
+
description: Scrape Google Scholar
|
48
48
|
email:
|
49
49
|
- y.wurm@qmul.ac.uk
|
50
|
+
- arkokoley@live.in
|
50
51
|
executables:
|
52
|
+
- rubyscholar
|
51
53
|
- scrape.rb
|
52
54
|
extensions: []
|
53
55
|
extra_rdoc_files: []
|
54
56
|
files:
|
55
57
|
- .gitignore
|
58
|
+
- Gemfile
|
59
|
+
- LICENSE.txt
|
56
60
|
- README.md
|
61
|
+
- Rakefile
|
62
|
+
- bin/rubyscholar
|
57
63
|
- bin/scrape.rb
|
58
64
|
- config.yml
|
59
65
|
- lib/rubyscholar.rb
|
66
|
+
- lib/rubyscholar/version.rb
|
67
|
+
- rubyscholar.gemspec
|
60
68
|
homepage: ''
|
61
|
-
licenses:
|
62
|
-
- MIT
|
69
|
+
licenses: []
|
63
70
|
post_install_message:
|
64
71
|
rdoc_options: []
|
65
72
|
require_paths:
|
@@ -81,5 +88,5 @@ rubyforge_project:
|
|
81
88
|
rubygems_version: 1.8.23
|
82
89
|
signing_key:
|
83
90
|
specification_version: 3
|
84
|
-
summary:
|
91
|
+
summary: Rubyscholar scrapes google scholar and formats it into a scholar.html file.
|
85
92
|
test_files: []
|